# 微调模型评估
微调完的模型到底好不好用?这一章给你一套科学、可复现的评估方法论。
## 评估体系架构

```mermaid
graph TD
MODEL[微调模型] --> AUTO[自动化评估]
MODEL --> HUMAN[人工评估]
MODEL --> AB[A/B 测试]
AUTO --> METRICS[指标计算]
AUTO --> BENCH[基准测试]
AUTO --> REG[回归测试]
METRICS --> BLEU[BLEU / ROUGE]
METRICS --> ACC[准确率 / F1]
METRICS --> PPL[困惑度]
HUMAN --> BLIND[盲评打分]
HUMAN --> COMPARE[对比评测]
HUMAN --> DOMAIN[领域专家评审]
AB --> ONLINE[线上 A/B]
AB --> SHADOW[影子模式]
style MODEL fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style AUTO fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style HUMAN fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style AB fill:#fce4ec,stroke:#c62828,stroke-width:2px
```

## 自动化评估指标

```python
"""
微调模型自动化评估
"""
from dataclasses import dataclass
@dataclass
class AutoEvalMetrics:
"""自动化评估指标体系"""
METRICS = {
"文本生成类": {
"BLEU": {
"用途": "翻译、摘要等有标准答案的任务",
"范围": "0-100,越高越好",
"优秀": "> 30",
"缺点": "不评估语义,只看 n-gram 重合",
},
"ROUGE-L": {
"用途": "摘要生成",
"范围": "0-1,越高越好",
"优秀": "> 0.4",
"缺点": "同上",
},
"BERTScore": {
"用途": "语义相似度",
"范围": "0-1,越高越好",
"优秀": "> 0.85",
"缺点": "计算慢",
},
},
"分类类": {
"准确率": {
"用途": "分类、意图识别",
"范围": "0-1",
"优秀": "> 0.9",
"缺点": "类别不均衡时失效",
},
"F1 Score": {
"用途": "不均衡分类",
"范围": "0-1",
"优秀": "> 0.85",
"缺点": "需要拆分 P/R 看",
},
},
"生成质量": {
"困惑度 PPL": {
"用途": "语言模型质量",
"范围": "越低越好",
"优秀": "< 10",
"缺点": "不直接反映任务表现",
},
"LLM-as-Judge": {
"用途": "GPT-4 自动评分",
"范围": "1-10",
"优秀": "> 7",
"缺点": "可能有偏好偏差",
},
},
}
metrics = AutoEvalMetrics()
print("=== 自动化评估指标 ===")
for category, items in metrics.METRICS.items():
print(f"\n📊 {category}:")
for name, info in items.items():
        print(f" {name}: {info['用途']} (优秀线: {info['优秀']})")
```
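
上面的指标体系把困惑度 (PPL) 列为生成质量指标,但没有给出计算代码。下面是一个最小示意:假设模型是用 transformers 加载的因果语言模型,`compute_perplexity` 为示意函数名,并非固定 API。

```python
"""
困惑度 (PPL) 计算示意
"""
import math

import torch


def compute_perplexity(model, tokenizer, texts: list[str]) -> float:
    """PPL = exp(按 token 数加权的平均交叉熵损失),越低越好"""
    model.eval()
    total_loss, total_tokens = 0.0, 0
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt").to(model.device)
            # labels 与 input_ids 相同,transformers 内部会自动右移一位计算交叉熵
            outputs = model(**inputs, labels=inputs["input_ids"])
            n_tokens = inputs["input_ids"].shape[1] - 1  # 右移后实际参与损失的 token 数
            total_loss += outputs.loss.item() * n_tokens
            total_tokens += n_tokens
    return math.exp(total_loss / total_tokens)
```

把验证集原始文本传进去即可;注意长度只有 1 个 token 的样本不贡献损失,最好提前过滤。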
## 评估实战代码

```python
"""
评估实战代码
"""
import json
class FineTuneEvaluator:
"""微调模型评估器"""
def __init__(self, model, tokenizer):
self.model = model
self.tokenizer = tokenizer
def evaluate_generation(self, test_data: list[dict]) -> dict:
"""生成任务评估"""
from rouge_score import rouge_scorer
import numpy as np
scorer = rouge_scorer.RougeScorer(
["rouge1", "rouge2", "rougeL"], use_stemmer=False
)
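        # 注意:rouge_score 默认分词只保留英文字母和数字,评估中文需先分词(如 jieba)后用空格拼接,否则得分会严重失真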
scores = {"rouge1": [], "rouge2": [], "rougeL": []}
predictions = []
for sample in test_data:
# 生成预测
inputs = self.tokenizer(
sample["instruction"],
return_tensors="pt",
).to(self.model.device)
outputs = self.model.generate(
**inputs,
max_new_tokens=512,
                do_sample=False,  # 贪心解码,保证评估可复现;do_sample=False 时 temperature 不生效,故移除
)
pred = self.tokenizer.decode(
outputs[0][inputs["input_ids"].shape[1]:],
skip_special_tokens=True,
)
predictions.append(pred)
# 计算 ROUGE
result = scorer.score(sample["output"], pred)
for key in scores:
scores[key].append(result[key].fmeasure)
return {
"rouge1": float(np.mean(scores["rouge1"])),
"rouge2": float(np.mean(scores["rouge2"])),
"rougeL": float(np.mean(scores["rougeL"])),
"num_samples": len(test_data),
}
def evaluate_with_llm_judge(
self, test_data: list[dict], judge_model: str = "gpt-4o"
) -> dict:
"""LLM-as-Judge 评估"""
from openai import OpenAI
client = OpenAI()
JUDGE_PROMPT = """请评估以下 AI 回答的质量,1-10 分。
评分维度:
- 准确性 (1-10)
- 完整性 (1-10)
- 清晰度 (1-10)
- 实用性 (1-10)
- 综合 overall (1-10)
用户问题:{question}
AI 回答:{answer}
请以 JSON 格式返回:
{{"accuracy": N, "completeness": N, "clarity": N, "usefulness": N, "overall": N}}
"""
results = []
for sample in test_data:
# 模型生成回答
inputs = self.tokenizer(
sample["instruction"], return_tensors="pt"
).to(self.model.device)
outputs = self.model.generate(**inputs, max_new_tokens=512)
pred = self.tokenizer.decode(
outputs[0][inputs["input_ids"].shape[1]:],
skip_special_tokens=True,
)
# GPT-4 评分
response = client.chat.completions.create(
model=judge_model,
messages=[
{
"role": "user",
"content": JUDGE_PROMPT.format(
question=sample["instruction"],
answer=pred,
),
}
],
response_format={"type": "json_object"},
)
score = json.loads(
response.choices[0].message.content
)
results.append(score)
# 汇总
import numpy as np
avg_scores = {}
for key in ["accuracy", "completeness", "clarity", "usefulness", "overall"]:
avg_scores[key] = float(
np.mean([r[key] for r in results])
)
return avg_scores
def regression_test(
self,
test_data: list[dict],
baseline_scores: dict,
threshold: float = 0.05,
) -> dict:
"""回归测试:确保新模型不比基线差"""
current_scores = self.evaluate_generation(test_data)
regressions = []
for metric, baseline_val in baseline_scores.items():
if metric in current_scores:
current_val = current_scores[metric]
diff = current_val - baseline_val
if diff < -threshold:
regressions.append({
"metric": metric,
"baseline": baseline_val,
"current": current_val,
"diff": diff,
})
return {
"passed": len(regressions) == 0,
"regressions": regressions,
"current_scores": current_scores,
}
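

# 补充:指标体系里提到的 BERTScore(语义相似度)也可以很容易接入。
# 下面是基于 bert-score 包的最小示意(需 pip install bert-score),
# evaluate_bertscore 为示意函数名;lang="zh" 会自动选用中文 BERT 模型。
def evaluate_bertscore(predictions: list[str], references: list[str], lang: str = "zh") -> float:
    """返回平均 BERTScore F1,衡量生成结果与参考答案的语义相似度"""
    from bert_score import score

    # P / R / F1 均为逐样本张量,这里取 F1 均值作为整体分数
    P, R, F1 = score(predictions, references, lang=lang, verbose=False)
    return float(F1.mean())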
# 使用示例
print("""
# 完整评估流程
evaluator = FineTuneEvaluator(model, tokenizer)
# 1. 自动指标
auto_scores = evaluator.evaluate_generation(test_data)
print(f"ROUGE-L: {auto_scores['rougeL']:.3f}")
# 2. LLM 评分
llm_scores = evaluator.evaluate_with_llm_judge(test_data[:50])
print(f"Overall: {llm_scores['overall']:.1f}/10")
# 3. 回归测试
regression = evaluator.regression_test(test_data, baseline)
print(f"回归测试: {'✅ 通过' if regression['passed'] else '❌ 失败'}")
""")
```

## 人工评估方法

```python
"""
人工评估设计
"""
class HumanEvaluation:
"""人工评估"""
METHODS = {
"盲评打分": {
"流程": [
"1. 准备 100+ 测试样本",
"2. 隐藏模型名称,随机编号",
"3. 3+ 位评估者独立打分 (1-5)",
"4. 计算一致性 (Cohen Kappa > 0.6)",
"5. 取均值作为最终分数",
],
"评分维度": [
"准确性", "流畅性", "相关性", "安全性",
],
},
"A/B 对比": {
"流程": [
"1. 同一问题,两个模型分别回答",
"2. 评估者选择更好的 (A/B/平手)",
"3. 统计胜率",
"4. 200+ 样本保证统计显著",
],
"评分维度": ["整体偏好", "分维度偏好"],
},
"Elo 评分": {
"流程": [
"1. 多模型循环对比",
"2. 按 Elo 算法更新分数",
"3. 类似 Chatbot Arena",
],
"评分维度": ["综合排名"],
},
}
EVALUATION_TEMPLATE = """
## 人工评估表
**样本编号**: ___
**评估者**: ___
**问题**: {question}
**模型回答**: {answer}
| 维度 | 1分 | 2分 | 3分 | 4分 | 5分 |
|------|-----|-----|-----|-----|-----|
| 准确性 | 错误 | 部分错误 | 基本正确 | 正确 | 完全正确 |
| 流畅性 | 不通顺 | 勉强 | 可以 | 流畅 | 优秀 |
| 相关性 | 无关 | 略相关 | 相关 | 切题 | 精准 |
| 安全性 | 有害 | 略不当 | 安全 | 适当 | 优秀 |
**总体评分**: ___ / 5
**备注**: ___
"""
he = HumanEvaluation()
print("=== 人工评估方法 ===")
for name, info in he.METHODS.items():
print(f"\n{name}:")
for step in info["流程"]:
        print(f" {step}")
```
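
盲评流程里要求检查评估者之间的一致性(Cohen's Kappa > 0.6)。下面是一个用 scikit-learn 两两计算 Kappa 的最小示意,示例打分为虚构数据,`check_rater_agreement` 为示意函数名。

```python
"""
评估者一致性检验示意(需 scikit-learn)
"""
from itertools import combinations

from sklearn.metrics import cohen_kappa_score


def check_rater_agreement(ratings: dict[str, list[int]], threshold: float = 0.6) -> dict:
    """两两计算评估者打分的 Cohen's Kappa,低于阈值说明打分标准需要重新对齐"""
    pairwise = {}
    for (a, scores_a), (b, scores_b) in combinations(ratings.items(), 2):
        # 对 1-5 这类有序打分,也可传 weights="quadratic" 使用加权 Kappa
        pairwise[f"{a} vs {b}"] = cohen_kappa_score(scores_a, scores_b)
    return {
        "pairwise_kappa": pairwise,
        "passed": all(k >= threshold for k in pairwise.values()),
    }


# 三位评估者对同一批样本的 1-5 分打分(虚构数据)
ratings = {
    "评估者A": [5, 4, 3, 5, 2, 4],
    "评估者B": [5, 4, 4, 5, 2, 4],
    "评估者C": [4, 4, 3, 5, 3, 4],
}
print(check_rater_agreement(ratings))
```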
## 评估最佳实践
| 评估阶段 | 方法 | 样本量 | 频率 |
|---|---|---|---|
| 训练中 | 验证集 Loss / PPL | 全量验证集 | 每 100 步 |
| 训练后 | 自动指标 (ROUGE) | 500+ | 每次训练 |
| 发布前 | LLM-as-Judge | 100+ | 每次发布 |
| 发布前 | 人工盲评 | 50+ | 重大版本 |
| 线上 | A/B 测试 | 1000+ | 持续 |
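
上表中的线上 A/B 测试和前面的人工对比评测,都依赖足够的样本量才能得出显著结论。下面是一个用 scipy 做二项检验的最小示意,判断胜率是否显著偏离 50%,胜负场次为虚构数据。

```python
"""
A/B 对比胜率显著性检验示意(需 scipy)
"""
from scipy.stats import binomtest


def ab_win_rate_test(wins_a: int, wins_b: int, alpha: float = 0.05) -> dict:
    """忽略平手,用双侧二项检验判断模型 A 的胜率是否显著不同于 50%"""
    n = wins_a + wins_b
    result = binomtest(wins_a, n, p=0.5, alternative="two-sided")
    return {
        "win_rate_a": wins_a / n,
        "p_value": result.pvalue,
        "significant": result.pvalue < alpha,
    }


# 新模型 A 在 230 场非平手对比中赢 138 场(虚构数据)
print(ab_win_rate_test(wins_a=138, wins_b=92))
```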
下一章:模型对比与选择——如何从多个候选模型中选出最优解。