评估驱动优化循环
评估不是终点,是起点。建立从"评估发现问题 → 定位根因 → 修复优化 → 再评估"的持续改进闭环,才能让 LLM 应用持续提升。
优化闭环架构
graph LR
A[部署运行] --> B[持续评估<br/>Continuous Eval]
B --> C{指标下降?}
C -->|是| D[根因分析<br/>Root Cause]
C -->|否| A
D --> E{数据问题?}
D --> F{提示词问题?}
D --> G{模型问题?}
E --> H[更新训练/评估数据]
F --> I[迭代系统提示词]
G --> J[微调/换模型]
H --> K[验证修复效果]
I --> K
J --> K
K --> A
style B fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style D fill:#fff9c4,stroke:#f9a825,stroke-width:2px
style K fill:#c8e6c9,stroke:#43a047,stroke-width:2px
评估驱动优化实现
from dataclasses import dataclass, field
from enum import Enum
from datetime import datetime
import json
class OptimizationTarget(Enum):
    """Which dimension of the system an optimization action targets."""
    PROMPT = "prompt"
    DATA = "data"
    MODEL = "model"
    PIPELINE = "pipeline"
@dataclass
class EvalCheckpoint:
    """Evaluation snapshot — records the eval results of one version.

    Attributes:
        version: version tag, e.g. "v1.0-baseline".
        timestamp: when the evaluation was run.
        scores: per-metric scores, e.g. {"accuracy": 0.82, "safety": 0.96}.
        config: prompt/model/data configuration of this version.
        notes: free-form notes about what changed in this version.
    """
    version: str
    timestamp: datetime
    scores: dict[str, float]
    config: dict
    notes: str = ""

    def set_delta(self, previous: "EvalCheckpoint") -> None:
        """Cache per-metric deltas (this minus *previous*) for has_regression.

        Fix: `has_regression` reads a `_delta` attribute that was never
        written anywhere in the original code, making the property dead
        code — this method is the documented writer. Metrics missing from
        this checkpoint are treated as 0.0.
        """
        self._delta = {
            metric: self.scores.get(metric, 0.0) - prev_val
            for metric, prev_val in previous.scores.items()
        }

    @property
    def has_regression(self) -> bool:
        """True if any cached delta is negative.

        Returns False when no delta has been cached via `set_delta`
        (reads via getattr, so an unset `_delta` is safe).
        """
        return any(v < 0 for v in getattr(self, "_delta", {}).values())

    def to_dict(self) -> dict:
        """JSON-safe summary of the checkpoint.

        NOTE(review): `config` is omitted — presumably because it may hold
        non-serializable values; confirm before adding it.
        """
        return {
            "version": self.version,
            "timestamp": self.timestamp.isoformat(),
            "scores": self.scores,
            "notes": self.notes,
        }
@dataclass
class OptimizationAction:
    """Record of a single optimization attempt and its measured effect."""
    action_id: str
    target: OptimizationTarget
    description: str
    before_scores: dict[str, float]
    after_scores: dict[str, float] = field(default_factory=dict)

    def improvement(self) -> dict[str, float]:
        """Per-metric delta (after minus before), rounded to 4 decimals.

        Metrics absent from `after_scores` are treated as 0.
        """
        deltas: dict[str, float] = {}
        for metric, before in self.before_scores.items():
            after = self.after_scores.get(metric, 0)
            deltas[metric] = round(after - before, 4)
        return deltas

    @property
    def net_positive(self) -> bool:
        """Whether the action helped more metrics than it hurt.

        A metric counts as hurt only when it dropped by more than 0.01,
        so tiny negative noise does not veto an otherwise good change.
        """
        deltas = self.improvement().values()
        gained = sum(d > 0 for d in deltas)
        lost = sum(d < -0.01 for d in deltas)
        return gained > lost
class EvalOptimizationLoop:
    """Manager for the eval-driven optimization loop.

    Tracks evaluation checkpoints and optimization actions, detects metric
    regressions between consecutive checkpoints, diagnoses gaps against
    target scores, and recommends the next optimization target.
    """

    def __init__(self, project_name: str, target_scores: dict[str, float]):
        self.project_name = project_name
        self.target_scores = target_scores  # target value per metric
        self.checkpoints: list["EvalCheckpoint"] = []
        self.actions: list["OptimizationAction"] = []

    def add_checkpoint(self, checkpoint: "EvalCheckpoint") -> None:
        """Record an eval snapshot, caching its delta vs the previous one.

        Fix: nothing in the original code ever populated the `_delta`
        attribute that `EvalCheckpoint.has_regression` reads, so that
        property was dead code. The delta is now computed here whenever a
        previous checkpoint exists.
        """
        if self.checkpoints:
            prev_scores = self.checkpoints[-1].scores
            # Set the attribute directly so `has_regression` (which reads it
            # via getattr) has data, without requiring any new checkpoint API.
            checkpoint._delta = {
                metric: checkpoint.scores.get(metric, 0.0) - prev_val
                for metric, prev_val in prev_scores.items()
            }
        self.checkpoints.append(checkpoint)
        print(f"[{checkpoint.version}] 评估快照记录: {checkpoint.scores}")

    def detect_regressions(self, threshold: float = 0.02) -> list[str]:
        """Compare the two latest snapshots and list regressed metrics.

        Args:
            threshold: a drop must exceed this value to count as a
                regression (filters out statistical noise).

        Returns:
            Human-readable "metric: prev → curr (↓delta)" strings; empty
            when fewer than two checkpoints exist or nothing regressed.
        """
        if len(self.checkpoints) < 2:
            return []
        latest = self.checkpoints[-1].scores
        previous = self.checkpoints[-2].scores
        regressions = []
        for metric, prev_val in previous.items():
            curr_val = latest.get(metric, 0)
            if prev_val - curr_val > threshold:
                regressions.append(
                    f"{metric}: {prev_val:.3f} → {curr_val:.3f} (↓{prev_val - curr_val:.3f})"
                )
        return regressions

    def diagnose_gap(self) -> dict[str, str]:
        """Analyze the gap between the latest scores and the targets.

        Returns a per-metric status string: met (gap <= 0), close
        (gap < 0.05), or far from target. Empty dict when no checkpoints.
        """
        if not self.checkpoints:
            return {}
        current = self.checkpoints[-1].scores
        diagnosis = {}
        for metric, target in self.target_scores.items():
            current_val = current.get(metric, 0.0)
            gap = target - current_val
            if gap <= 0:
                diagnosis[metric] = f"✅ 达标 ({current_val:.3f} ≥ {target:.3f})"
            elif gap < 0.05:
                diagnosis[metric] = f"⚠️ 接近目标 (差 {gap:.3f})"
            else:
                diagnosis[metric] = f"❌ 差距较大 (差 {gap:.3f},当前 {current_val:.3f})"
        return diagnosis

    def suggest_optimization_target(self) -> "OptimizationTarget":
        """Recommend an optimization direction from the latest failure mode.

        Simplified heuristic: a missing metric defaults to a passing score
        (1.0), so only metrics that were actually measured low trigger a
        recommendation. Falls back to PROMPT when no checkpoints exist.
        """
        if not self.checkpoints:
            return OptimizationTarget.PROMPT
        current = self.checkpoints[-1].scores
        if current.get("instruction_following", 1.0) < 0.7:
            return OptimizationTarget.PROMPT
        elif current.get("factual_accuracy", 1.0) < 0.7:
            return OptimizationTarget.DATA
        elif current.get("safety", 1.0) < 0.90:
            return OptimizationTarget.MODEL
        else:
            return OptimizationTarget.PIPELINE

    def export_report(self) -> str:
        """Export the optimization history as a pretty-printed JSON string."""
        report = {
            "project": self.project_name,
            "target_scores": self.target_scores,
            "checkpoint_count": len(self.checkpoints),
            "action_count": len(self.actions),
            "latest_scores": self.checkpoints[-1].scores if self.checkpoints else {},
            "regressions": self.detect_regressions(),
            "gap_analysis": self.diagnose_gap(),
        }
        return json.dumps(report, ensure_ascii=False, indent=2)
# --- Demo: drive the optimization loop through two eval rounds ---
opt_loop = EvalOptimizationLoop(
    project_name="customer-service-bot-v2",
    target_scores={"accuracy": 0.85, "safety": 0.95, "instruction_following": 0.90},
)

# Round 1: baseline evaluation.
opt_loop.add_checkpoint(EvalCheckpoint(
    version="v1.0-baseline",
    timestamp=datetime.now(),
    scores={"accuracy": 0.72, "safety": 0.88, "instruction_following": 0.80},
    config={"model": "gpt-3.5-turbo", "system_prompt": "v1"},
))

# Round 2: re-test after improving the prompt.
opt_loop.add_checkpoint(EvalCheckpoint(
    version="v1.1-prompt-improved",
    timestamp=datetime.now(),
    scores={"accuracy": 0.79, "safety": 0.93, "instruction_following": 0.88},
    config={"model": "gpt-3.5-turbo", "system_prompt": "v2"},
    notes="增加了拒绝不当请求的示例,提升安全性",
))

# Report regressions, the gap to target, and the suggested next move.
found = opt_loop.detect_regressions()
print(f"回归指标: {found if found else '无回归'}")

for metric, status in opt_loop.diagnose_gap().items():
    print(f" {metric}: {status}")

print(f"\n建议优化方向: {opt_loop.suggest_optimization_target().value}")
优化决策矩阵
| 问题症状 | 可能根因 | 推荐行动 | 预计改善 |
|---|---|---|---|
| 准确性持续低 | 训练数据噪声 | 清洗数据集 | +5–15% |
| 指令遵从差 | 提示词歧义 | 重写系统提示 | +10–20% |
| 安全指标下降 | 新攻击手法 | 更新防护提示词 | +5–10% |
| 延迟上升 | 模型版本升级 | 引入缓存/蒸馏 | 性能改善 |
| 特定场景差 | 缺少相关训练样本 | 构建场景数据集 | +5–20% |
本章小结
- 每次修改都要留快照——没有基线就无法衡量改善
- 回归阈值设为 2%——小于 2% 的波动可能是统计噪声
- 一次只改一个变量——同时修改提示词和模型,无法定位根因
- 优化是持续的,不是一次性的——设定季度性评估节奏
- 文档化每次优化的原因和结果——便于团队知识传承
下一章:开源评估工具对比