Agent 性能评估与优化
构建 Agent 容易,让它在生产中持续稳定、高质量地运行才是真正的挑战。本章介绍如何系统地评估和优化 Agent 性能。
评估体系设计
graph TB
A[Agent 评估体系] --> B[任务完成质量]
A --> C[运行效率]
A --> D[可靠性]
A --> E[成本]
B --> B1[任务成功率]
B --> B2[输出质量分]
B --> B3[用户满意度]
C --> C1[平均响应时间]
C --> C2[工具调用次数]
C --> C3[Token 消耗量]
D --> D1[错误率]
D --> D2[超时率]
D --> D3[幻觉发生率]
E --> E1[单次任务成本]
E --> E2[每日总支出]
E --> E3[成本/质量比]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
评估数据集构建
"""
Agent 评估数据集
"""
from dataclasses import dataclass, field
import json
import time
@dataclass
class EvalCase:
    """A single evaluation case: one task plus the criteria used to judge it."""
    # Unique identifier for this case within the eval set.
    case_id: str
    # The instruction handed to the agent under test.
    task: str
    # Human-readable description of the desired result.
    expected_outcome: str
    # Criteria fed to the judge LLM when scoring the agent's output.
    evaluation_criteria: list[str]
    difficulty: str  # easy / medium / hard
    category: str  # category (e.g. research / coding / analysis)
    # Iteration budget for the agent on this task.
    max_iterations: int = 10
    # Wall-clock budget for a single run.
    timeout_seconds: int = 60
@dataclass
class EvalResult:
    """Outcome of a single evaluation run against the agent."""
    case_id: str
    task: str
    # Raw agent output; empty string when the run raised an exception.
    output: str
    # True when the quality score cleared the pass threshold.
    success: bool
    quality_score: float  # 0-1
    # Agent loop iterations consumed by this run.
    iterations_used: int
    # Total tokens consumed by this run.
    tokens_used: int
    # End-to-end wall-clock time of the run.
    latency_seconds: float
    # Dollar cost attributed to this run.
    cost_usd: float
    # Exception text captured from a failed run; empty on success.
    error: str = ""
class AgentEvaluator:
    """Drives an agent through a list of EvalCase items and aggregates metrics.

    The optional judge LLM is used by _judge_quality to grade outputs; when it
    is absent a neutral score of 0.5 is assigned.
    """

    def __init__(self, agent, judge_llm=None):
        self.agent = agent          # must expose .run(task) -> str
        self.judge_llm = judge_llm  # optional grader LLM
        self.results: list[EvalResult] = []

    def run_eval(self, cases: list[EvalCase]) -> dict:
        """Run every case in order, echo per-case progress, and return summary metrics."""
        print(f"开始评估 {len(cases)} 个用例...")
        for idx, case in enumerate(cases, 1):
            print(f"\n[{idx}/{len(cases)}] {case.category} - {case.task[:40]}...")
            outcome = self._run_single(case)
            self.results.append(outcome)
            marker = "✅" if outcome.success else "❌"
            print(
                f" {marker} 质量={outcome.quality_score:.2f}, "
                f"延迟={outcome.latency_seconds:.1f}s, "
                f"成本=${outcome.cost_usd:.4f}"
            )
        return self._compute_metrics()

    def _run_single(self, case: EvalCase) -> EvalResult:
        """Execute one case; any exception becomes a failed EvalResult rather than propagating."""
        started = time.time()
        try:
            answer = self.agent.run(case.task)
            elapsed = time.time() - started
            score = self._judge_quality(case, answer)
            # Usage counters are read best-effort from the agent, if it tracks them.
            return EvalResult(
                case_id=case.case_id,
                task=case.task,
                output=answer,
                success=score >= 0.7,  # pass threshold
                quality_score=score,
                iterations_used=getattr(self.agent, "last_iterations", 1),
                tokens_used=getattr(self.agent, "last_tokens", 0),
                latency_seconds=elapsed,
                cost_usd=getattr(self.agent, "last_cost", 0.0),
            )
        except Exception as exc:
            # Failed run: zero out usage fields and record the error text.
            return EvalResult(
                case_id=case.case_id,
                task=case.task,
                output="",
                success=False,
                quality_score=0.0,
                iterations_used=0,
                tokens_used=0,
                latency_seconds=time.time() - started,
                cost_usd=0.0,
                error=str(exc),
            )

    def _judge_quality(self, case: EvalCase, output: str) -> float:
        """Grade the output 0.0-1.0 via the judge LLM; 0.5 when grading is impossible."""
        if not self.judge_llm or not output:
            return 0.5
        bullet_lines = "\n".join(f"- {item}" for item in case.evaluation_criteria)
        judge_prompt = f"""评估以下 Agent 输出的质量(0.0-1.0 分)。
任务: {case.task}
评估标准:
{bullet_lines}
Agent 输出:
{output[:1000]}
请输出 JSON: {{"score": 0.85, "reason": "简短说明"}}"""
        # A real implementation would send judge_prompt to self.judge_llm
        # and parse the score out of the returned JSON.
        return 0.8  # placeholder score for the example

    def _compute_metrics(self) -> dict:
        """Aggregate all recorded results into summary statistics ({} when no results)."""
        if not self.results:
            return {}
        n = len(self.results)
        passed = len([r for r in self.results if r.success])
        errored = len([r for r in self.results if r.error])
        return {
            "total_cases": n,
            "success_rate": round(passed / n, 3),
            "avg_quality": round(sum(r.quality_score for r in self.results) / n, 3),
            "avg_latency_s": round(sum(r.latency_seconds for r in self.results) / n, 2),
            "avg_tokens": round(sum(r.tokens_used for r in self.results) / n),
            "total_cost_usd": round(sum(r.cost_usd for r in self.results), 4),
            "error_rate": round(errored / n, 3),
        }
常见性能瓶颈与优化
"""
Agent 性能优化策略
"""
# Playbook of common agent performance bottlenecks and how to address them.
# All keys and values are user-facing Chinese text; each entry maps a strategy
# name to its problem statement ("问题"), concrete tactics ("优化"), and the
# expected impact ("效果").
OPTIMIZATION_GUIDE = {
    "减少 LLM 调用次数": {
        "问题": "每次工具调用都需要 LLM 决策,调用链太长",
        "优化": [
            "合并相似的小任务,一次提交给 LLM",
            "用 ReAct one-shot 提示减少规划步骤",
            "为常见任务预构建工具链模板",
        ],
        "效果": "迭代次数减少 30-50%",
    },
    "智能缓存": {
        "问题": "相同问题重复调用,浪费 Token",
        "优化": [
            "对工具调用结果缓存(如搜索、API 查询)",
            "使用语义相似度缓存 LLM 响应",
            "为热点数据预热缓存",
        ],
        "效果": "重复任务成本降至 0",
    },
    "模型分层路由": {
        "问题": "所有步骤都用 GPT-4o,成本高",
        "优化": [
            "规划/分解用强模型(gpt-4o)",
            "工具调用决策用快速模型(gpt-4o-mini)",
            "简单格式转换用规则处理",
        ],
        "效果": "成本降低 60%,质量损失 < 5%",
    },
    "并行执行": {
        "问题": "工具调用串行,总延迟 = 每步之和",
        "优化": [
            "识别无依赖关系的工具调用,并行执行",
            "异步 I/O 处理网络请求",
            "使用线程池处理 CPU 密集任务",
        ],
        "效果": "延迟降低 50-70%",
    },
}

# Render the guide as an indented plain-text report, one section per strategy.
for name, info in OPTIMIZATION_GUIDE.items():
    print(f"\n{name}")
    print(f" 问题: {info['问题']}")
    for opt in info["优化"]:
        print(f" - {opt}")
    print(f" 效果: {info['效果']}")
关键指标基准
| 指标 | 优秀 | 良好 | 需优化 |
|---|---|---|---|
| 任务成功率 | > 85% | 70-85% | < 70% |
| 平均响应时间 | < 10s | 10-30s | > 30s |
| 平均迭代次数 | < 5 | 5-10 | > 10 |
| 单任务成本 | < $0.01 | $0.01-$0.05 | > $0.05 |
| 幻觉发生率 | < 2% | 2-5% | > 5% |
本章实践清单
- [ ] 建立至少 50 个覆盖不同场景的评估用例
- [ ] 为每个用例定义明确的评估标准(避免主观评估)
- [ ] 每次代码变更后自动运行评估,监控指标回归
- [ ] 识别成功率最低的任务类别,优先优化
- [ ] 建立成本预算告警,单日超支自动降级到小模型
- [ ] 定期(每月)更新评估集,加入用户真实失败案例
下一章:企业实践——生产部署、成本控制与监控体系。