使用 Qwen2.5-1.5B + QLoRA 微調情緒分類模型:訓練流程與評估結果實作
下面將每筆資料透過 _to_chat_text(...) 組成 ChatML 風格輸入
def _to_chat_text(tokenizer, instruction, user_input, output=None):
    """Render one sample as chat-template text for training or inference.

    The instruction and the text to classify are merged into a single user
    turn. When ``output`` is given it is appended as the assistant turn and
    the full conversation is rendered (training text). When ``output`` is
    ``None`` the template is rendered with ``add_generation_prompt=True`` so
    the text ends where the assistant answer should begin (inference prompt).

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        instruction: Task instruction placed at the top of the user turn.
        user_input: Text to classify, placed after the ``文本:`` marker.
        output: Optional reference answer used as the assistant turn.

    Returns:
        The result of ``tokenizer.apply_chat_template(..., tokenize=False)``.
    """
    user_turn = {
        "role": "user",
        "content": f"{instruction}\n\n文本:{user_input}",
    }

    if output is None:
        # Inference: stop right at the start of the assistant turn.
        return tokenizer.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True
        )

    # Training: include the reference answer as the assistant turn.
    assistant_turn = {"role": "assistant", "content": output}
    return tokenizer.apply_chat_template(
        [user_turn, assistant_turn], tokenize=False
    )
接下來是 LoRA 訓練(以 4-bit 量化載入基礎模型,即 QLoRA):
def cmd_train(args):
    """Fine-tune the base model with a LoRA adapter on top of 4-bit weights.

    Attributes read from ``args``:
        model: Base model id/path passed to ``from_pretrained``.
        data: Training set location passed to ``_read_jsonl``; each row must
            provide ``instruction``, ``input`` and ``output``.
        out: Output directory for checkpoints and the final adapter.
        lora_r: LoRA rank (``lora_alpha`` is set to twice this value).
        epochs: Number of training epochs.
        max_seq_len: Maximum sequence length handed to ``SFTConfig``.

    Side effects: trains with ``SFTTrainer``, saves the result to
    ``args.out`` and prints a confirmation line.
    """
    import torch
    from datasets import Dataset
    from peft import LoraConfig
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              BitsAndBytesConfig)
    from trl import SFTConfig, SFTTrainer

    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        # Fall back to EOS when the tokenizer ships without a pad token.
        tok.pad_token = tok.eos_token

    # 4-bit quantized loading (the key to saving GPU memory).
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        args.model, quantization_config=quant_cfg, device_map="auto"
    )

    rank = args.lora_r
    adapter_cfg = LoraConfig(
        r=rank,
        lora_alpha=rank * 2,  # by convention alpha is 2x the rank
        # Adapt k/o in addition to q/v so every attention projection is
        # covered; intended to sharpen the separation of similar emotions.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    )

    # Render every row (including its answer) into chat-template text.
    samples = []
    for row in _read_jsonl(args.data):
        samples.append(
            _to_chat_text(tok, row["instruction"], row["input"], row["output"])
        )
    train_set = Dataset.from_dict({"text": samples})

    # SFT settings: batch size, gradient accumulation, learning rate,
    # epochs, fp16, warmup, checkpoint strategy, etc.
    train_cfg = SFTConfig(
        output_dir=args.out,
        # 8 GB VRAM: a small batch trims peak memory and avoids display
        # driver TDR resets.
        per_device_train_batch_size=2,
        # Effective batch size = 16 (2 x 8, same as the original setting).
        gradient_accumulation_steps=8,
        num_train_epochs=args.epochs,
        learning_rate=2e-4,
        fp16=True,
        gradient_checkpointing=True,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_strategy="epoch",
        max_length=args.max_seq_len,
        dataset_text_field="text",
    )

    # Build the trainer and run training.
    sft_trainer = SFTTrainer(
        model=base_model,
        args=train_cfg,
        train_dataset=train_set,
        peft_config=adapter_cfg,
        processing_class=tok,
    )
    sft_trainer.train()

    # Save the LoRA adapter.
    sft_trainer.save_model(args.out)
    print(f"✓ adapter 已存到 {args.out}")
訓練結束後會生成 emotion-lora 的 adapter 資料,如下:

下面是評估流程:載入基礎模型並套用 LoRA adapter,對測試資料逐筆推理,最後輸出分類報告、推理延遲與最常混淆的類別對。
def cmd_eval(args):
    """Evaluate the base model plus LoRA adapter on a labelled test set.

    Attributes read from ``args``:
        model: Base model id/path passed to ``from_pretrained``.
        adapter: LoRA adapter path passed to ``PeftModel.from_pretrained``.
        data: Test set location passed to ``_read_jsonl``; each row must
            provide ``instruction``, ``input`` and ``output``.

    Each row is prompted through ``_to_chat_text`` without its answer,
    decoded greedily (at most 16 new tokens) and mapped to a label with
    ``_extract_label``; generations without a recognisable label are recorded
    as ``"未知"``.

    Prints the per-class classification report, the median generation
    latency in milliseconds, and the most frequent off-diagonal cell of the
    confusion matrix (true label, predicted label).

    Raises:
        ValueError: If the test set contains no rows.
    """
    import statistics
    import time

    import torch
    from peft import PeftModel
    from sklearn.metrics import classification_report, confusion_matrix
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Read the test data first so an empty set fails fast, before the model
    # is loaded. list() also makes the emptiness check valid if the reader
    # returns an iterator rather than a list.
    rows = list(_read_jsonl(args.data))
    if not rows:
        raise ValueError(f"測試資料為空:{args.data}")

    # Load tokenizer and base model.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        args.model, device_map="auto", torch_dtype=torch.float16
    )
    # Apply the LoRA adapter.
    model = PeftModel.from_pretrained(model, args.adapter)
    model.eval()

    y_true, y_pred, latencies = [], [], []
    for r in rows:
        # Build the prompt (no answer) and run inference.
        prompt = _to_chat_text(tokenizer, r["instruction"], r["input"])
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        t0 = time.perf_counter()
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=16, do_sample=False)
        latencies.append((time.perf_counter() - t0) * 1000)

        # Decode only the newly generated tokens, not the echoed prompt.
        gen = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

        # Extract the emotion label from the generated text.
        pred = _extract_label(gen) or "未知"
        y_true.append(r["output"])
        y_pred.append(pred)

    # Compute and print the evaluation results.
    print(classification_report(
        y_true, y_pred, labels=EMOTIONS, target_names=EMOTIONS, zero_division=0
    ))

    # statistics.median averages the two middle values for an even number of
    # samples; indexing the sorted list at len // 2 would report the
    # upper-middle element instead.
    print(f"推理延遲:中位數 {statistics.median(latencies):.0f}ms / 條")

    # Confusion matrix -> find the most frequently confused class pair.
    # Rows are true labels and columns are predicted labels; only the single
    # direction (true -> predicted) with the highest count is reported, and
    # ties keep the first pair in row-major order.
    cm = confusion_matrix(y_true, y_pred, labels=EMOTIONS)
    worst, worst_n = None, 0
    for i, true_label in enumerate(EMOTIONS):
        for j, pred_label in enumerate(EMOTIONS):
            if i != j and cm[i][j] > worst_n:
                worst_n, worst = cm[i][j], (true_label, pred_label)
    if worst:
        print(f"最常混淆:{worst[0]} ↔ {worst[1]}({worst_n} 次)")
def _extract_label(text):
    """Return the ``EMOTIONS`` label that appears earliest in ``text``.

    Picking the earliest occurrence (rather than the first label in
    ``EMOTIONS`` order that occurs anywhere) keeps the model's actual answer
    when the generation mentions more than one label, e.g. an answer followed
    by an explanation that names another emotion.

    Args:
        text: Generated text to scan; ``None`` or an empty string is allowed.

    Returns:
        The matching label, or ``None`` when no label is found.
    """
    if not text:
        return None

    best_label, best_pos = None, len(text)
    for emo in EMOTIONS:
        pos = text.find(emo)
        # Strict "<" keeps the earlier entry of EMOTIONS on an exact tie.
        if 0 <= pos < best_pos:
            best_label, best_pos = emo, pos
    return best_label
測試後可以得到這樣的結果:
| - | precision | recall | f1-score | support |
|---|---|---|---|---|
| 喜悅 | 0.79 | 1.00 | 0.88 | 11 |
| 悲傷 | 0.44 | 1.00 | 0.62 | 8 |
| 憤怒 | 1.00 | 0.40 | 0.57 | 10 |
| 恐懼 | 1.00 | 0.11 | 0.20 | 9 ← 幾乎全錯 |
| 驚訝 | 0.75 | 0.75 | 0.75 | 8 |
| 厭惡 | 0.83 | 0.62 | 0.71 | 8 |
| macro avg | 0.80 | 0.65 | 0.62 | 54 |
各欄位代表的意思如下:
- precision:精確率
  - 在模型預測為該類別的樣本中,有多少是真的該類別
  - 公式:`TP / (TP + FP)`
- recall:召回率,也叫查全率
  - 在實際屬於該類別的樣本中,有多少被模型正確找到
  - 公式:`TP / (TP + FN)`
- f1-score:F1 分數
  - precision 和 recall 的調和平均
  - 公式:`2 * precision * recall / (precision + recall)`
  - 用來平衡精確率與召回率,當兩者不平衡時比單看一個更能反映整體效果
- support:這個類別在測試資料中實際出現的樣本數
  - 也就是該類別的真實樣本數量