RAG 系统评估实战
RAG 评估的特殊性
RAG(检索增强生成)系统同时涉及检索和生成两个环节,评估必须分别覆盖,再进行端到端验证。
graph TB
A[RAG 评估体系] --> B[检索评估]
A --> C[生成评估]
A --> D[端到端评估]
B --> B1[检索准确率 Precision@K]
B --> B2[检索召回率 Recall@K]
B --> B3[MRR 平均倒数排名]
B --> B4[NDCG 归一化折损累计增益]
C --> C1[忠实度 Faithfulness]
C --> C2[相关性 Relevance]
C --> C3[完整性 Completeness]
C --> C4[无幻觉 No Hallucination]
D --> D1[答案正确率]
D --> D2[用户满意度]
D --> D3[端到端延迟]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style B fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
检索质量评估
"""
RAG 检索质量评估工具
"""
from dataclasses import dataclass
import math
@dataclass
class RetrievalResult:
    """One labelled retrieval example: what the retriever returned vs. ground truth."""
    query: str  # the user query text
    retrieved_ids: list[str]  # document IDs returned by the retriever, in rank order
    relevant_ids: list[str]  # truly relevant document IDs (annotated gold labels)
class RetrievalEvaluator:
    """Evaluate retrieval quality with standard IR metrics.

    Metrics: Precision@K, Recall@K, MRR, and NDCG@K. The metric methods are
    static and operate on objects exposing ``retrieved_ids`` (ranked) and
    ``relevant_ids`` (gold labels); ``evaluate_batch`` aggregates a batch and
    prints a small report.
    """

    @staticmethod
    def precision_at_k(result: "RetrievalResult", k: int = 5) -> float:
        """Precision@K: fraction of the top-K retrieved docs that are relevant."""
        top_k = result.retrieved_ids[:k]
        hits = sum(1 for doc_id in top_k if doc_id in result.relevant_ids)
        return hits / k

    @staticmethod
    def recall_at_k(result: "RetrievalResult", k: int = 5) -> float:
        """Recall@K: fraction of all relevant docs covered by the top-K results."""
        top_k = result.retrieved_ids[:k]
        hits = sum(1 for doc_id in top_k if doc_id in result.relevant_ids)
        # No relevant docs annotated -> recall is defined as 0 here.
        return hits / len(result.relevant_ids) if result.relevant_ids else 0

    @staticmethod
    def mrr(results: list["RetrievalResult"]) -> float:
        """Mean Reciprocal Rank over a batch of queries.

        For each query, the reciprocal rank is 1/(position of the first
        relevant doc), or 0 if no relevant doc was retrieved.
        Returns 0.0 for an empty batch (bug fix: previously raised
        ZeroDivisionError when ``results`` was empty).
        """
        reciprocal_ranks = []
        for result in results:
            for rank, doc_id in enumerate(result.retrieved_ids, start=1):
                if doc_id in result.relevant_ids:
                    reciprocal_ranks.append(1.0 / rank)
                    break
            else:
                # No relevant document found anywhere in the ranking.
                reciprocal_ranks.append(0.0)
        if not reciprocal_ranks:
            return 0.0
        return sum(reciprocal_ranks) / len(reciprocal_ranks)

    @staticmethod
    def ndcg_at_k(result: "RetrievalResult", k: int = 5) -> float:
        """NDCG@K: normalized discounted cumulative gain with binary relevance."""
        top_k = result.retrieved_ids[:k]
        # DCG: gain discounted by log2(rank + 1); index i is 0-based, hence i + 2.
        dcg = 0.0
        for i, doc_id in enumerate(top_k):
            rel = 1.0 if doc_id in result.relevant_ids else 0.0
            dcg += rel / math.log2(i + 2)
        # Ideal DCG: same gains sorted best-first (over the retrieved top-K).
        ideal_rels = sorted(
            [1.0 if doc_id in result.relevant_ids else 0.0 for doc_id in top_k],
            reverse=True
        )
        idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal_rels))
        return dcg / idcg if idcg > 0 else 0.0

    def evaluate_batch(self, results: list["RetrievalResult"], k: int = 5) -> dict:
        """Score a batch of queries, print a bar-chart report, and return the metrics."""
        precisions = [self.precision_at_k(r, k) for r in results]
        recalls = [self.recall_at_k(r, k) for r in results]
        ndcgs = [self.ndcg_at_k(r, k) for r in results]
        mrr_score = self.mrr(results)

        def _mean(values: list[float]) -> float:
            # Mean with an empty-list guard (mirrors the metric-level guards).
            return sum(values) / len(values) if values else 0

        report = {
            f"Precision@{k}": round(_mean(precisions), 4),
            f"Recall@{k}": round(_mean(recalls), 4),
            f"NDCG@{k}": round(_mean(ndcgs), 4),
            "MRR": round(mrr_score, 4),
        }
        print(f"\n检索评估报告 (K={k}, {len(results)} 条查询):")
        for metric, value in report.items():
            bar = "█" * int(value * 20)
            print(f" {metric:>15}: {value:.4f} {bar}")
        return report
# Example: score a small batch of labelled retrieval results.
test_results = [
    RetrievalResult(
        query="退货政策",
        retrieved_ids=["doc1", "doc3", "doc5", "doc7", "doc9"],
        relevant_ids=["doc1", "doc3", "doc8"],
    ),
    RetrievalResult(
        query="配送时间",
        retrieved_ids=["doc2", "doc4", "doc1", "doc6", "doc8"],
        relevant_ids=["doc2", "doc6"],
    ),
    RetrievalResult(
        query="会员积分",
        retrieved_ids=["doc10", "doc3", "doc7", "doc2", "doc5"],
        relevant_ids=["doc3", "doc7", "doc10"],
    ),
]
evaluator = RetrievalEvaluator()
evaluator.evaluate_batch(test_results, k=5)
生成质量评估:忠实度与幻觉检测
| 评估维度 | 定义 | 评估方法 | 目标 |
|---|---|---|---|
| 忠实度 | 回答是否基于检索到的文档 | LLM Judge + NLI 模型 | >0.90 |
| 相关性 | 回答是否与问题相关 | 语义相似度 | >0.85 |
| 完整性 | 是否回答了问题的所有方面 | 要点匹配 | >0.80 |
| 幻觉率 | 包含检索文档中不存在的信息 | 事实核查 | <5% |
| 引用准确性 | 引用的来源是否准确 | 引用验证 | >95% |
"""
RAG 生成质量评估 — 忠实度检测
"""
class FaithfulnessEvaluator:
    """Check whether an answer's claims are grounded in the retrieved contexts.

    Faithfulness = fraction of claims supported by the concatenated contexts.
    Support is approximated by character-set overlap — a cheap stand-in for a
    real LLM judge or NLI model.
    """

    def __init__(self, llm_judge_fn=None):
        # Optional hook for a real LLM-based judge; the heuristic path ignores it.
        self.llm_judge_fn = llm_judge_fn

    def decompose_claims(self, answer: str) -> list[str]:
        """Split the answer into individual claims (naive sentence split on '。')."""
        claims = []
        for sentence in answer.split("。"):
            sentence = sentence.strip()
            # Fragments of 5 characters or fewer are too short to be a claim.
            if len(sentence) > 5:
                claims.append(sentence)
        return claims

    def verify_claim(self, claim: str, context: str) -> dict:
        """Judge one claim against the context via character-set overlap."""
        claim_chars = set(claim)
        if claim_chars:
            ratio = len(claim_chars & set(context)) / len(claim_chars)
        else:
            ratio = 0
        return {
            "claim": claim,
            "supported": ratio > 0.3,
            "confidence": round(ratio, 2),
        }

    def evaluate(self, answer: str, contexts: list[str]) -> dict:
        """Score the answer's faithfulness against all contexts combined."""
        merged_context = "\n".join(contexts)
        claims = self.decompose_claims(answer)
        details = [self.verify_claim(claim, merged_context) for claim in claims]
        supported = sum(1 for item in details if item["supported"])
        score = supported / len(claims) if claims else 0
        return {
            "faithfulness_score": round(score, 4),
            "total_claims": len(claims),
            "supported_claims": supported,
            "unsupported_claims": len(claims) - supported,
            "details": details,
        }
# Usage example: check a generated answer against its retrieved contexts.
evaluator = FaithfulnessEvaluator()
demo_answer = "我们的退货期限是30天。商品需要保持原包装完好。退货运费由买家承担。"
demo_contexts = [
    "退货政策:客户可在购买后30天内申请退货,商品须保持原包装。",
    "运费政策:退货运费由平台承担。",
]
result = evaluator.evaluate(demo_answer, demo_contexts)
print(f"忠实度: {result['faithfulness_score']:.2%}")
print(f"受支持声明: {result['supported_claims']}/{result['total_claims']}")
RAGAS 评估指标对照
graph LR
A[RAGAS 框架] --> B[Faithfulness]
A --> C[Answer Relevancy]
A --> D[Context Precision]
A --> E[Context Recall]
B --> B1[生成内容是否忠实于上下文]
C --> C1[回答是否与问题相关]
D --> D1[检索上下文是否精确]
E --> E1[检索是否覆盖所有相关信息]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
| RAGAS 指标 | 计算方式 | 适用场景 | 工具支持 |
|---|---|---|---|
| Faithfulness | 声明分解 → 逐条验证 | 所有 RAG 场景 | ragas, DeepEval |
| Answer Relevancy | 从答案生成问题 → 与原始问题比较 | 开放式 QA | ragas |
| Context Precision | 相关上下文排名评估 | 优化检索排序 | ragas, LlamaIndex |
| Context Recall | 参考答案中的信息是否都在上下文中 | 评估检索覆盖率 | ragas |
| Answer Correctness | 与标准答案对比 | 有标注的场景 | ragas, DeepEval |
本章小结
- RAG 评估需要分别覆盖检索质量和生成质量两个层面
- 检索评估使用 Precision@K、Recall@K、MRR、NDCG 等经典 IR 指标
- 生成评估以忠实度为核心,防止幻觉产生
- RAGAS 框架提供了标准化的 RAG 评估方法
- 端到端评估需结合自动指标和人工抽样
延伸阅读:查看 RAG 检索增强生成实战指南,了解完整的 RAG 系统构建方法。