多模型对比评估
在选型或版本迭代决策时,需要对多个模型进行横向比较。单一模型的绝对分数意义有限——相对排名才是决策依据。
ELO 排名系统
```mermaid
graph TB
    A[多模型评估体系] --> B[绝对评分<br/>Absolute Scoring]
    A --> C[相对排名<br/>Pairwise Ranking]
    B --> B1[各模型独立打分<br/>1–5 分量表]
    C --> C1[A vs B 盲测<br/>ELO 积分计算]
    C1 --> D[ELO Rating]
    D --> E[胜率矩阵<br/>Win-Rate Matrix]
    D --> F[Bootstrap CI<br/>置信区间]
    style A fill:#ede7f6,stroke:#5e35b1,stroke-width:2px
    style C fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
    style D fill:#c8e6c9,stroke:#43a047,stroke-width:2px
```
ELO 积分实现
from dataclasses import dataclass, field
from enum import Enum
import math
import random
class MatchResult(Enum):
    """Outcome of a single pairwise (A vs B) comparison, from A's point of view."""

    A_WINS = "a_wins"
    B_WINS = "b_wins"
    TIE = "tie"
@dataclass
class EloRating:
    """Track a single model's ELO rating and win/loss/tie record."""

    model_name: str
    initial_rating: float = 1000.0
    k_factor: float = 32.0  # step size of each rating adjustment
    rating: float = field(init=False)
    wins: int = 0
    losses: int = 0
    ties: int = 0

    def __post_init__(self):
        # Start the live rating at the configured baseline.
        self.rating = self.initial_rating

    @property
    def total_matches(self) -> int:
        return self.wins + self.losses + self.ties

    def win_rate(self) -> float:
        """Fraction of matches won, counting each tie as half a win."""
        played = self.total_matches
        if not played:
            return 0.0
        return (self.wins + 0.5 * self.ties) / played

    def expected_score(self, opponent_rating: float) -> float:
        """Expected score vs an opponent, per the standard ELO formula."""
        return 1.0 / (1.0 + 10 ** ((opponent_rating - self.rating) / 400))

    def update(self, opponent_rating: float, result: MatchResult) -> float:
        """Apply one match result (from this model's perspective) and
        return the rating delta."""
        expected = self.expected_score(opponent_rating)
        if result is MatchResult.A_WINS:
            actual = 1.0
            self.wins += 1
        elif result is MatchResult.B_WINS:
            actual = 0.0
            self.losses += 1
        else:
            actual = 0.5
            self.ties += 1
        delta = self.k_factor * (actual - expected)
        self.rating += delta
        return delta
class ModelComparator:
    """Manage pairwise ELO evaluation across multiple models.

    Models are registered by name; each recorded match updates both
    participants' ratings and is appended to ``match_history``.
    """

    def __init__(self, k_factor: float = 32.0):
        self.models: dict[str, EloRating] = {}
        self.k_factor = k_factor
        self.match_history: list[dict] = []

    def register_model(self, name: str, initial_rating: float = 1000.0) -> None:
        """Register a model with a starting ELO rating."""
        self.models[name] = EloRating(name, initial_rating, self.k_factor)

    def record_match(
        self,
        model_a: str,
        model_b: str,
        result: MatchResult,
        prompt_id: str = "",
    ) -> None:
        """Record one comparison (``result`` is from model_a's perspective)
        and update both models' ELO.

        Raises KeyError if either model was not registered.
        """
        elo_a = self.models[model_a]
        elo_b = self.models[model_b]
        # Snapshot pre-match ratings so both updates use the same baseline
        # instead of A's already-updated rating.
        rating_a_before = elo_a.rating
        rating_b_before = elo_b.rating
        elo_a.update(rating_b_before, result)
        # Mirror the result for B's perspective.
        b_result = {
            MatchResult.A_WINS: MatchResult.B_WINS,
            MatchResult.B_WINS: MatchResult.A_WINS,
            MatchResult.TIE: MatchResult.TIE,
        }[result]
        elo_b.update(rating_a_before, b_result)
        self.match_history.append({
            "prompt_id": prompt_id,
            "model_a": model_a,
            "model_b": model_b,
            "result": result.value,
        })

    def leaderboard(self) -> list[dict]:
        """Return models sorted by ELO (descending) with 1-based ranks filled in."""
        board = sorted(
            (
                {
                    "rank": 0,
                    "model": m.model_name,
                    "elo": round(m.rating, 1),
                    "win_rate": round(m.win_rate(), 3),
                    "matches": m.total_matches,
                }
                for m in self.models.values()
            ),
            key=lambda entry: entry["elo"],
            reverse=True,
        )
        # Previously "rank" was left as a 0 placeholder for callers to fill;
        # populate it here so the leaderboard is complete on its own.
        for rank, entry in enumerate(board, start=1):
            entry["rank"] = rank
        return board

    def win_matrix(self) -> dict[str, dict[str, float]]:
        """Return the head-to-head win-rate matrix.

        A tie contributes 0.5 to each side; the diagonal is fixed at 0.5
        and unplayed pairings report 0.0.
        """
        # Float scores: a win is 1.0, a tie is 0.5 per side.
        score: dict[str, dict[str, float]] = {
            a: {b: 0.0 for b in self.models} for a in self.models
        }
        counts: dict[str, dict[str, int]] = {
            a: {b: 0 for b in self.models} for a in self.models
        }
        for match in self.match_history:
            a, b, result = match["model_a"], match["model_b"], match["result"]
            counts[a][b] += 1
            counts[b][a] += 1
            if result == "a_wins":
                score[a][b] += 1.0
            elif result == "b_wins":
                score[b][a] += 1.0
            else:
                # BUG FIX: a tie used to add a FULL win to both sides,
                # inflating every win rate; each side gets 0.5 instead.
                score[a][b] += 0.5
                score[b][a] += 0.5
        win_rates: dict[str, dict[str, float]] = {}
        for a in self.models:
            win_rates[a] = {}
            for b in self.models:
                if a == b:
                    win_rates[a][b] = 0.5  # self-comparison: neutral by convention
                elif counts[a][b] > 0:
                    win_rates[a][b] = round(score[a][b] / counts[a][b], 3)
                else:
                    win_rates[a][b] = 0.0  # no head-to-head data yet
        return win_rates
# Demo: run a small pairwise comparison between four models.
comparator = ModelComparator(k_factor=32.0)
for contender in ("GPT-4o", "Claude-3.7-Sonnet", "Gemini-1.5-Pro", "Qwen2.5-72B"):
    comparator.register_model(contender)

# Sample pairwise outcomes (each result is from the first model's perspective).
test_results = [
    ("GPT-4o", "Qwen2.5-72B", MatchResult.A_WINS),
    ("Claude-3.7-Sonnet", "GPT-4o", MatchResult.TIE),
    ("Gemini-1.5-Pro", "Qwen2.5-72B", MatchResult.A_WINS),
    ("Claude-3.7-Sonnet", "Gemini-1.5-Pro", MatchResult.A_WINS),
    ("GPT-4o", "Gemini-1.5-Pro", MatchResult.TIE),
]
for challenger, opponent, outcome in test_results:
    comparator.record_match(challenger, opponent, outcome)

print("\n📊 模型排行榜:")
for rank, entry in enumerate(comparator.leaderboard(), 1):
    entry["rank"] = rank
    print(f" #{entry['rank']} {entry['model']}: ELO={entry['elo']}, 胜率={entry['win_rate']:.1%}")
盲测设计原则
| 设计要素 | 说明 | 常见错误 |
|---|---|---|
| 随机化 | A/B 位置随机排列 | 固定 A=新模型会引入偏见 |
| 配对样本 | 同一 prompt 对比 | 不同 prompt 无可比性 |
| 评估维度 | 事先定义胜负标准 | 评估者凭感觉判断 |
| 样本量 | ≥500 对才能统计显著 | 50 对结论不可信 |
| 隐藏版本 | 评估者不知道模型来源 | 已知版本引入期望效应 |
本章小结
- ELO 比绝对分更稳定——不依赖打分标定,排名可跨批次比较
- 盲测是必须的——评估者知道版本后会产生确认偏见
- 胜率矩阵揭示弱点——A 模型可能赢了 B 但输给 C
- 需要配套的 prompt 集——对比的 prompt 要覆盖核心场景
- ELO K值调整策略——K=32 适合初始阶段,稳定后可降至 16
下一章:幻觉与偏见检测