# Building Your Own Evaluation Platform

## Why Build Your Own Evaluation Platform

Open-source tools each have their strengths, but enterprise-grade evaluation also needs a unified entry point, historical comparisons, access control, and integration with internal systems. Building your own platform is not reinventing the wheel; it is orchestrating the best existing tools behind one interface.
```mermaid
graph TB
    A[Evaluation Platform] --> B[Data Management]
    A --> C[Evaluation Orchestration]
    A --> D[Result Visualization]
    A --> E[Integration Interfaces]
    B --> B1[Dataset CRUD]
    B --> B2[Version Management]
    B --> B3[Annotation Workflow]
    C --> C1[Multi-Tool Scheduling]
    C --> C2[Parallel Evaluation]
    C --> C3[Scheduled Jobs]
    D --> D1[Dashboards]
    D --> D2[Trend Charts]
    D --> D3[Comparison Reports]
    E --> E1[CI/CD Webhook]
    E --> E2[Slack/Feishu Notifications]
    E --> E3[Model Registry API]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style C fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
```
## Platform Architecture Design
"""
评估平台核心架构
"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
class EvalStatus(Enum):
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
@dataclass
class EvalJob:
"""评估任务"""
job_id: str
model_name: str
dataset_name: str
evaluators: list[str]
status: EvalStatus = EvalStatus.PENDING
created_at: str = ""
completed_at: str = ""
results: dict = field(default_factory=dict)
def __post_init__(self):
if not self.created_at:
self.created_at = datetime.now().isoformat()
class EvalPlatform:
"""评估平台"""
def __init__(self):
self.jobs: dict[str, EvalJob] = {}
self.evaluator_registry: dict[str, callable] = {}
self.dataset_registry: dict[str, list] = {}
def register_evaluator(self, name: str, evaluator_fn):
"""注册评估器"""
self.evaluator_registry[name] = evaluator_fn
print(f" 已注册评估器: {name}")
def register_dataset(self, name: str, samples: list):
"""注册数据集"""
self.dataset_registry[name] = samples
print(f" 已注册数据集: {name} ({len(samples)} 条)")
def submit_job(self, model_name: str, dataset_name: str,
evaluators: list[str]) -> str:
"""提交评估任务"""
job_id = f"eval_{len(self.jobs)+1:04d}"
job = EvalJob(
job_id=job_id,
model_name=model_name,
dataset_name=dataset_name,
evaluators=evaluators,
)
self.jobs[job_id] = job
print(f" 已提交: {job_id} | 模型={model_name} 数据集={dataset_name}")
return job_id
def run_job(self, job_id: str) -> dict:
"""执行评估任务"""
job = self.jobs[job_id]
job.status = EvalStatus.RUNNING
print(f"\n 执行 {job_id}:")
dataset = self.dataset_registry.get(job.dataset_name, [])
results = {}
for eval_name in job.evaluators:
evaluator = self.evaluator_registry.get(eval_name)
if evaluator:
score = evaluator(dataset)
results[eval_name] = round(score, 4)
print(f" {eval_name}: {score:.4f}")
job.results = results
job.status = EvalStatus.COMPLETED
job.completed_at = datetime.now().isoformat()
return results
def compare_models(self, job_ids: list[str]) -> dict:
"""对比多次评估结果"""
comparison = {}
for job_id in job_ids:
job = self.jobs[job_id]
comparison[f"{job.model_name} ({job_id})"] = job.results
print("\n模型对比:")
metrics = set()
for r in comparison.values():
metrics.update(r.keys())
for metric in sorted(metrics):
print(f"\n {metric}:")
for model, results in comparison.items():
score = results.get(metric, "N/A")
bar = "█" * int(float(score) * 20) if isinstance(score, float) else ""
print(f" {model}: {score} {bar}")
return comparison
# 使用示例
platform = EvalPlatform()
# 注册评估器
platform.register_evaluator("accuracy", lambda data: 0.87)
platform.register_evaluator("faithfulness", lambda data: 0.92)
platform.register_evaluator("safety", lambda data: 0.98)
# 注册数据集
platform.register_dataset("qa-v2", [{"q": "退货政策?", "a": "30天"}] * 100)
# 提交评估
job1 = platform.submit_job("gpt-4o", "qa-v2", ["accuracy", "faithfulness", "safety"])
job2 = platform.submit_job("claude-3.5", "qa-v2", ["accuracy", "faithfulness", "safety"])
# 运行
platform.run_job(job1)
platform.run_job(job2)
# 对比
platform.compare_models([job1, job2])
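The orchestration module in the architecture diagram also calls for parallel evaluation (C2), which the sequential `run_job` calls above do not cover. Below is a minimal sketch of parallel dispatch using the standard-library `concurrent.futures`; the `run_jobs_parallel` helper and its `max_workers` setting are illustrative assumptions, not part of the platform class above.

```python
# Sketch: run several evaluation jobs concurrently (hypothetical helper,
# not part of EvalPlatform). Assumes the registered evaluators are thread-safe.
from concurrent.futures import ThreadPoolExecutor, as_completed


def run_jobs_parallel(platform: EvalPlatform, job_ids: list[str],
                      max_workers: int = 4) -> dict[str, dict]:
    """Run jobs in a thread pool and collect results keyed by job ID."""
    results: dict[str, dict] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(platform.run_job, jid): jid for jid in job_ids}
        for future in as_completed(futures):
            jid = futures[future]
            try:
                results[jid] = future.result()
            except Exception as exc:  # record the failure, keep other jobs running
                platform.jobs[jid].status = EvalStatus.FAILED
                results[jid] = {"error": str(exc)}
    return results


# results = run_jobs_parallel(platform, [job1, job2])
```

Threads are sufficient here because evaluation is dominated by I/O (model API calls); for CPU-bound scoring, a process pool would be the natural swap.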
## Evaluation Report Design

| Report Module | Contents | Audience |
|---|---|---|
| Summary | Overall score, pass/fail, key findings | Management |
| Detailed Metrics | Per-dimension scores, trend comparisons | Tech leads |
| Failure Analysis | Failed cases, error patterns | Developers |
| Security Report | Safety tests, compliance checks | Security team |
| Cost Analysis | Token consumption, spend statistics | Operations |
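The report generator below implements the summary and detailed-metrics modules. For the cost-analysis module, here is a minimal sketch that aggregates per-call token counts into an estimated spend; the `PRICES` table, its rates, and the per-call record format are illustrative assumptions, not actual provider pricing.

```python
# Sketch of the cost-analysis module. PRICES is an illustrative assumption;
# substitute your provider's actual per-million-token rates.
PRICES = {"gpt-4o": {"input": 2.50, "output": 10.00}}  # USD per 1M tokens (assumed)


def summarize_cost(model: str, calls: list[dict]) -> dict:
    """Aggregate per-call token counts into totals and an estimated spend."""
    input_tokens = sum(c["input_tokens"] for c in calls)
    output_tokens = sum(c["output_tokens"] for c in calls)
    price = PRICES[model]
    cost = (input_tokens * price["input"] + output_tokens * price["output"]) / 1_000_000
    return {"input_tokens": input_tokens, "output_tokens": output_tokens,
            "estimated_cost_usd": round(cost, 4)}


print(summarize_cost("gpt-4o", [{"input_tokens": 1200, "output_tokens": 300}] * 100))
```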
"""
评估报告生成器
"""
from datetime import datetime
class EvalReportGenerator:
"""评估报告生成器"""
def __init__(self, job_results: dict, model_name: str):
self.results = job_results
self.model = model_name
def generate_markdown(self) -> str:
"""生成 Markdown 格式报告"""
report = []
report.append(f"# LLM 评估报告")
report.append(f"**模型**: {self.model}")
report.append(f"**时间**: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
# 总体评分
overall = sum(self.results.values()) / len(self.results) if self.results else 0
status = "✅ 通过" if overall >= 0.85 else "❌ 未通过"
report.append(f"\n## 总体评分: {overall:.2%} {status}")
# 各维度
report.append("\n## 详细指标\n")
report.append("| 指标 | 得分 | 状态 |")
report.append("|------|------|------|")
for metric, score in self.results.items():
s = "✅" if score >= 0.85 else "⚠️" if score >= 0.7 else "❌"
report.append(f"| {metric} | {score:.4f} | {s} |")
return "\n".join(report)
def generate_alert(self, threshold: float = 0.85) -> str | None:
"""生成告警消息"""
failed = {k: v for k, v in self.results.items() if v < threshold}
if not failed:
return None
alert = f"⚠️ 评估告警 [{self.model}]\n"
for metric, score in failed.items():
alert += f" • {metric}: {score:.4f} (阈值: {threshold})\n"
return alert
# 示例
report_gen = EvalReportGenerator(
{"accuracy": 0.87, "faithfulness": 0.92, "safety": 0.98, "latency": 0.73},
"gpt-4o"
)
print(report_gen.generate_markdown())
alert = report_gen.generate_alert()
if alert:
print(f"\n{alert}")
## Production Deployment Checklist

```mermaid
graph LR
    A[Deployment Checklist] --> B[Evaluation Passed]
    A --> C[Security Audit]
    A --> D[Performance Baseline]
    A --> E[Rollback Plan]
    B --> B1[Core metrics >85%]
    B --> B2[No regressions]
    C --> C1[Injection success rate <1%]
    C --> C2[PII leakage 0%]
    D --> D1[P95 latency within target]
    D --> D2[Stable at 50 concurrent requests]
    E --> E1[Fast version switching]
    E --> E2[Monitoring & alerting]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style B fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
```
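This checklist translates naturally into an automated release gate, for example as a CI step behind the CI/CD webhook. A minimal sketch is below; the `deploy_gate` function, its input keys, and the default latency target are illustrative assumptions, while the thresholds mirror the diagram.

```python
# Sketch of an automated release gate based on the checklist above.
# Input format and the deploy_gate helper are illustrative assumptions.
def deploy_gate(metrics: dict[str, float]) -> tuple[bool, list[str]]:
    """Check evaluation, security, and performance results against the checklist."""
    failures = []
    if metrics.get("core_score", 0.0) < 0.85:
        failures.append("core metrics below 85%")
    if metrics.get("injection_success_rate", 1.0) >= 0.01:
        failures.append("injection success rate >= 1%")
    if metrics.get("pii_leak_rate", 1.0) > 0:
        failures.append("PII leakage detected")
    if metrics.get("p95_latency_ms", float("inf")) > metrics.get("p95_target_ms", 2000):
        failures.append("P95 latency over target")
    return (not failures), failures


ok, failures = deploy_gate({
    "core_score": 0.91, "injection_success_rate": 0.004,
    "pii_leak_rate": 0.0, "p95_latency_ms": 1800, "p95_target_ms": 2000,
})
print("Deploy:", "✅ go" if ok else f"❌ blocked: {failures}")
```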
## Chapter Summary

- The value of a self-built evaluation platform lies in unified orchestration, historical comparison, and system integration
- Core platform modules: data management, evaluation orchestration, result visualization, integration interfaces
- Evaluation reports should be layered by audience: summaries for management, details for developers
- Integrate alerting so that the right people are notified automatically when metrics drop
- Before production deployment, the full set of evaluation, security, and performance checks must pass
Further reading: see the LLM production practices guide for best practices in model deployment and operations.