完整评估案例:对话系统评估
将所有评估方法融合到一个完整案例中。以企业客服对话系统为例,展示从评估设计到结果闭环的全过程。
案例背景
graph TB
A[企业客服对话系统] --> B[评估目标]
B --> B1[回答准确性]
B --> B2[上下文忠实度]
B --> B3[用户满意度]
B --> B4[安全合规性]
B --> B5[响应效率]
A --> C[评估方法]
C --> C1[离线基准测试]
C --> C2[LLM-as-a-Judge]
C --> C3[幻觉检测]
C --> C4[A/B 测试]
C --> C5[用户反馈]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style B fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
评估数据集构建
"""
Step 1: 构建评估数据集
"""
from dataclasses import dataclass
@dataclass
class EvalSample:
    """One evaluation sample: a question with its reference answer and context."""
    question: str         # user question posed to the dialogue system
    expected_answer: str  # reference ("gold") answer, e.g. expert-written
    context: list[str]    # retrieved documents the answer should be grounded in
    category: str         # business category label, e.g. "售后"
    difficulty: str       # one of: easy / medium / hard
class EvalDatasetBuilder:
    """Incrementally collects EvalSample items and reports simple dataset stats."""

    def __init__(self):
        # Collected samples, in insertion order. (Forward-ref annotation so the
        # class does not require EvalSample to exist at evaluation time.)
        self.samples: "list[EvalSample]" = []

    def add_sample(self, sample: "EvalSample") -> None:
        """Append one sample to the dataset."""
        self.samples.append(sample)

    def from_production_logs(
        self, logs: list[dict], sample_size: int = 100
    ) -> None:
        """Randomly draw up to ``sample_size`` entries from production logs.

        Each log record must carry a "user_query" key; the remaining fields
        fall back to defaults when absent. Sampling is without replacement.
        """
        import random

        drawn = random.sample(logs, min(sample_size, len(logs)))
        for log in drawn:
            self.samples.append(EvalSample(
                question=log["user_query"],
                expected_answer=log.get("expert_answer", ""),
                context=log.get("retrieved_docs", []),
                category=log.get("category", "general"),
                difficulty="medium",  # logs carry no difficulty label; assume medium
            ))

    def get_stats(self) -> dict:
        """Return the total plus per-category and per-difficulty counts."""
        from collections import Counter

        # Counter replaces the original manual dict.get(key, 0) + 1 bookkeeping.
        categories = Counter(s.category for s in self.samples)
        difficulties = Counter(s.difficulty for s in self.samples)
        return {
            "total": len(self.samples),
            "categories": dict(categories),
            "difficulties": dict(difficulties),
        }
# --- Build a small demo dataset from hand-written samples ---
dataset_builder = EvalDatasetBuilder()
test_samples = [
    EvalSample(
        question="如何退换货?",
        expected_answer="7天内无理由退换货,需保持商品完好,联系客服获取退货地址。",
        context=[
            "退换货政策:购买后7天内可无理由退换货。",
            "退换货要求:商品需保持原包装,不影响二次销售。",
            "退货流程:联系客服获取退货地址,寄回后3-5个工作日退款。",
        ],
        category="售后",
        difficulty="easy",
    ),
    EvalSample(
        question="你们的产品和竞品 X 比有什么优势?",
        expected_answer="我们的产品在性价比、售后服务、技术支持方面有优势。",
        context=[
            "产品特点:高性价比、7x24小时售后、专属技术顾问。",
        ],
        category="售前",
        difficulty="hard",
    ),
    EvalSample(
        question="我的订单什么时候到?",
        expected_answer="根据您的订单信息,预计3-5个工作日送达。",
        context=[
            "物流信息:标准快递3-5个工作日,加急快递1-2个工作日。",
        ],
        category="物流",
        difficulty="easy",
    ),
]
# Register every hand-written sample with the builder.
for demo_sample in test_samples:
    dataset_builder.add_sample(demo_sample)
print(f"数据集统计: {dataset_builder.get_stats()}")
多维度评估引擎
"""
Step 2: 多维度评估引擎
"""
import re
class MultiDimensionEvaluator:
    """Scores a model answer along five dimensions plus an overall average.

    All scorers are lightweight, deterministic lexical heuristics (no LLM
    calls), so the evaluation loop can run offline and repeatably.
    """

    def __init__(self):
        # Dimension names, in reporting order.
        self.dimensions = [
            "accuracy",      # matches the reference answer
            "faithfulness",  # grounded in the retrieved context
            "completeness",  # covers all key points
            "safety",        # free of non-compliant claims
            "helpfulness",   # actionable and appropriately sized
        ]

    def evaluate_sample(
        self, sample: "EvalSample", model_answer: str
    ) -> dict:
        """Score one (sample, answer) pair.

        Returns a dict with one entry per dimension plus "overall", the
        unweighted mean of the five dimension scores.
        """
        scores = {}
        # 1. Accuracy - lexical match against the reference answer.
        scores["accuracy"] = self._score_accuracy(
            model_answer, sample.expected_answer
        )
        # 2. Faithfulness - is the answer grounded in the given context?
        scores["faithfulness"] = self._score_faithfulness(
            model_answer, sample.context
        )
        # 3. Completeness - are all reference key points covered?
        scores["completeness"] = self._score_completeness(
            model_answer, sample.expected_answer
        )
        # 4. Safety - any non-compliant content?
        scores["safety"] = self._score_safety(model_answer)
        # 5. Helpfulness - would the user find it useful?
        scores["helpfulness"] = self._score_helpfulness(
            model_answer, sample.question
        )
        # Composite score: plain mean over the five dimensions above.
        scores["overall"] = round(
            sum(scores.values()) / len(scores), 2
        )
        return scores

    def _score_accuracy(
        self, answer: str, expected: str
    ) -> float:
        """Accuracy (simplified): fraction of reference tokens found in the answer.

        NOTE(review): tokens come from whitespace splitting, so for Chinese
        text without spaces a whole clause becomes one "token" — confirm this
        coarse granularity is intended.
        """
        expected_words = set(expected.replace("。", "").replace(",", " ").split())
        answer_words = set(answer.replace("。", "").replace(",", " ").split())
        if not expected_words:
            return 0.5  # no reference available -> neutral score
        overlap = len(expected_words & answer_words)
        return round(min(overlap / max(len(expected_words), 1), 1.0), 2)

    def _score_faithfulness(
        self, answer: str, context: list[str]
    ) -> float:
        """Faithfulness: fraction of answer sentences grounded in the context.

        A sentence counts as grounded when more than 30% of its content
        characters appear in the concatenated context. Character-level
        matching is used because Chinese text has no word boundaries.
        """
        if not context:
            return 0.5  # nothing to ground against -> neutral score
        context_text = " ".join(context)
        grounded = 0
        total = 0
        for sent in answer.split("。"):
            sent = sent.strip()
            if len(sent) < 5:
                continue  # fragments are too short to judge
            total += 1
            # BUG FIX: the original filtered single characters with
            # `len(w) > 1`, which can never hold, so every sentence was
            # scored as ungrounded and faithfulness was always 0.
            # Compare meaningful (non-space, non-punctuation) characters.
            chars = [
                ch for ch in sent
                if not ch.isspace() and ch not in ",。!?、;:"
            ]
            matches = sum(1 for ch in chars if ch in context_text)
            if chars and matches > len(chars) * 0.3:
                grounded += 1
        return round(grounded / max(total, 1), 2)

    def _score_completeness(
        self, answer: str, expected: str
    ) -> float:
        """Completeness: fraction of reference key points echoed in the answer.

        Key points are the comma-separated clauses of the reference answer.
        NOTE(review): Chinese clauses contain no spaces, so each "keyword" is
        usually the whole clause (exact-substring match) — confirm intended.
        """
        key_points = expected.split(",")
        covered = sum(
            1 for point in key_points
            if any(kw in answer for kw in point.split() if len(kw) > 1)
        )
        return round(covered / max(len(key_points), 1), 2)

    def _score_safety(self, answer: str) -> float:
        """Safety: 1.0 unless a non-compliant pattern matches (then 0.3).

        Patterns target competitor disparagement, absolute guarantees and
        unsupported superiority claims.
        """
        unsafe_patterns = [
            # FIX: group the alternation. The original "竞对|竞品.*(差|...)"
            # flagged any bare mention of "竞对" due to `|` precedence.
            r"(?:竞对|竞品).*(?:差|垃圾|不好)",
            r"保证.*100%",
            r"绝对不会",
            # FIX: "[我我们]" was a character class (matches the single chars
            # 我 or 们); use a real alternation for 我 / 我们.
            r"我(?:们)?比.*好",
        ]
        for pattern in unsafe_patterns:
            if re.search(pattern, answer):
                return 0.3
        return 1.0

    def _score_helpfulness(
        self, answer: str, question: str
    ) -> float:
        """Helpfulness: base 0.5 plus bonuses for length, specifics, guidance."""
        score = 0.5
        # Reasonable length: neither a brush-off nor a wall of text.
        if 20 < len(answer) < 500:
            score += 0.2
        # Concrete information (numbers, steps, timelines).
        if re.search(r'\d+', answer):
            score += 0.15
        # Guidance: suggests a next action to the user.
        if any(w in answer for w in ["建议", "可以", "请"]):
            score += 0.15
        return round(min(score, 1.0), 2)
# --- Score each simulated model answer across all dimensions ---
evaluator = MultiDimensionEvaluator()
# Simulated model answers, one per entry in test_samples.
model_answers = [
    "7天内可以无理由退换货,请保持商品原包装。联系客服获取退货地址,寄回后3-5个工作日内退款。",
    "我们的产品性价比很高,提供7x24小时售后服务和专属技术顾问,这些是我们的核心优势。",
    "根据标准快递的时效,您的订单预计3-5个工作日送达。如需加急,可以选择加急快递1-2个工作日到。",
]
print("=== 多维度评估结果 ===\n")
for eval_sample, model_answer in zip(test_samples, model_answers):
    dim_scores = evaluator.evaluate_sample(eval_sample, model_answer)
    print(f"问题: {eval_sample.question}")
    print(f" 分类: {eval_sample.category} | 难度: {eval_sample.difficulty}")
    for dim_name, dim_score in dim_scores.items():
        # Render a 10-cell progress bar for the score.
        filled = int(dim_score * 10)
        bar = "█" * filled + "░" * (10 - filled)
        print(f" {dim_name:15s} {bar} {dim_score}")
    print()
评估报告生成
"""
Step 3: 生成评估报告
"""
import json
from datetime import datetime
class EvalReportGenerator:
    """Aggregates per-sample score dicts into an experiment-level report."""

    def __init__(self, experiment_name: str, model_name: str):
        self.experiment_name = experiment_name
        self.model_name = model_name
        # One entry per evaluated sample: {"question", "category", "scores"}.
        self.results: list[dict] = []

    def add_result(
        self,
        question: str,
        category: str,
        scores: dict,
    ) -> None:
        """Record the score dict produced for one evaluated sample."""
        self.results.append({
            "question": question,
            "category": category,
            "scores": scores,
        })

    def generate_report(self) -> dict:
        """Build the full report.

        Includes overall averages per dimension, per-category averages of the
        composite score, the three weakest dimensions, and recommendations.
        """
        # Group every score by dimension across all results.
        all_scores: dict = {}
        for r in self.results:
            for dim, score in r["scores"].items():
                all_scores.setdefault(dim, []).append(score)
        overall = {
            dim: round(sum(scores) / len(scores), 3)
            for dim, scores in all_scores.items()
        }
        # Per-category averages of the composite "overall" score.
        category_scores: dict = {}
        for r in self.results:
            category_scores.setdefault(r["category"], []).append(
                r["scores"].get("overall", 0)
            )
        category_avg = {
            cat: round(sum(scores) / len(scores), 3)
            for cat, scores in category_scores.items()
        }
        # Three weakest dimensions. BUG FIX: exclude the composite "overall"
        # entry - it is an average, not an actionable dimension, and the
        # original could report it as a "weak point".
        weak_dimensions = sorted(
            ((d, s) for d, s in overall.items() if d != "overall"),
            key=lambda x: x[1],
        )[:3]
        report = {
            "experiment": self.experiment_name,
            "model": self.model_name,
            "timestamp": datetime.now().isoformat(),
            "sample_count": len(self.results),
            "overall_scores": overall,
            "category_scores": category_avg,
            "weak_points": [
                {"dimension": d, "score": s} for d, s in weak_dimensions
            ],
            "recommendation": self._generate_recommendations(
                overall, weak_dimensions
            ),
        }
        return report

    def _generate_recommendations(
        self, overall: dict, weak: list
    ) -> list[str]:
        """Map low per-dimension averages to concrete improvement advice.

        Thresholds are heuristic acceptance bars; a dimension missing from
        ``overall`` defaults to 1 (treated as passing).
        """
        recs = []
        if overall.get("faithfulness", 1) < 0.7:
            recs.append("增强上下文约束:在 System Prompt 中强调仅基于上下文回答")
        if overall.get("accuracy", 1) < 0.6:
            recs.append("提升准确性:增加知识库覆盖范围或提升检索质量")
        if overall.get("safety", 1) < 0.8:
            recs.append("加强安全过滤:添加输出审核层,过滤不当内容")
        if overall.get("completeness", 1) < 0.6:
            recs.append("提升完整性:在 Prompt 中明确要求覆盖所有要点")
        if overall.get("helpfulness", 1) < 0.7:
            recs.append("提升有用性:增加具体信息和引导性建议")
        if not recs:
            recs.append("各项表现良好,建议保持现有配置并持续监控")
        return recs

    def print_report(self) -> None:
        """Render the report to stdout as a box-drawn summary."""
        report = self.generate_report()
        print("╔" + "═" * 50 + "╗")
        print(f"║ LLM 评估报告 - {report['experiment']}")
        print(f"║ 模型: {report['model']}")
        print(f"║ 样本数: {report['sample_count']}")
        print(f"║ 时间: {report['timestamp'][:19]}")
        print("╠" + "═" * 50 + "╣")
        print("║ 总体评分:")
        for dim, score in report["overall_scores"].items():
            # 20-cell bar; 0.7 is the pass/warn threshold for the status icon.
            bar = "█" * int(score * 20) + "░" * (20 - int(score * 20))
            status = "✅" if score >= 0.7 else "⚠️"
            print(f"║ {status} {dim:15s} {bar} {score:.3f}")
        print("║")
        print("║ 分类表现:")
        for cat, score in report["category_scores"].items():
            print(f"║ {cat}: {score:.3f}")
        print("║")
        print("║ 改进建议:")
        for i, rec in enumerate(report["recommendation"], 1):
            print(f"║ {i}. {rec}")
        print("╚" + "═" * 50 + "╝")
# --- Aggregate the demo scores into a final experiment report ---
report_gen = EvalReportGenerator(
    experiment_name="customer_service_v2",
    model_name="gpt-4o-mini",
)
for graded_sample, graded_answer in zip(test_samples, model_answers):
    sample_scores = evaluator.evaluate_sample(graded_sample, graded_answer)
    report_gen.add_result(
        graded_sample.question, graded_sample.category, sample_scores
    )
report_gen.print_report()
评估闭环流程
graph TB
A[定义评估目标] --> B[构建测试集]
B --> C[选择评估指标]
C --> D[运行评估]
D --> E[分析报告]
E --> F{达标?}
F -->|是| G[部署上线]
F -->|否| H[优化改进]
H --> I[调整 Prompt]
H --> J[优化检索]
H --> K[换模型]
I --> D
J --> D
K --> D
G --> L[持续监控]
L --> M{指标下降?}
M -->|是| N[回滚 + 新一轮评估]
M -->|否| L
N --> B
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style G fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style N fill:#ffcdd2,stroke:#c62828,stroke-width:2px
本章小结
完整的 LLM 评估流程:
- 构建数据集 - 从生产日志抽样 + 专家标注
- 多维评估 - 准确性、忠实度、完整性、安全性、有用性
- 自动化运行 - CI/CD 集成,每次改动自动跑评估
- 报告分析 - 定位薄弱环节,生成改进建议
- 持续迭代 - 评估→改进→再评估的闭环
关键心得:没有完美的评估方案,但有评估总比没有好。从简单开始,逐步完善。先用 LLM-as-a-Judge 快速起步,再引入人工评估校准,最终建立自动化评估流水线。