Self-RAG 与自适应检索
传统 RAG 对每个查询都执行检索,但并非所有问题都需要外部知识。Self-RAG 让模型自主决定何时检索、检索什么、以及如何验证生成结果。
Self-RAG 核心思想
graph TB
A[用户查询] --> B{需要检索?}
B -->|是| C[执行检索]
B -->|否| D[直接生成]
C --> E[检索相关文档]
E --> F{文档相关?}
F -->|相关| G[基于文档生成]
F -->|不相关| H[丢弃,自主回答]
G --> I{回答忠实于文档?}
I -->|是| J[输出回答]
I -->|否| K[重新生成]
D --> J
style B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style F fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style I fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style J fill:#c8e6c9,stroke:#388e3c,stroke-width:3px
Self-RAG 反思令牌
Self-RAG 引入四种特殊的反思令牌(Reflection Token)来控制生成过程:
| 令牌 | 含义 | 取值 | 作用 |
|---|---|---|---|
| Retrieve | 是否需要检索 | yes / no / continue | 控制检索触发 |
| IsRel | 文档是否相关 | relevant / irrelevant | 过滤噪声文档 |
| IsSup | 回答是否有文档支持 | fully / partially / no | 检查忠实度 |
| IsUse | 回答是否有用 | 5 / 4 / 3 / 2 / 1 | 评估回答质量 |
"""
Self-RAG 实现框架
"""
from dataclasses import dataclass
from enum import Enum
class RetrieveDecision(Enum):
    """Self-RAG "Retrieve" reflection token: should retrieval be triggered?"""

    YES = "yes"            # external evidence is needed before answering
    NO = "no"              # answer directly from the model's own knowledge
    CONTINUE = "continue"  # keep generating without a fresh retrieval (see Self-RAG paper)
class RelevanceJudgment(Enum):
    """Self-RAG "IsRel" reflection token: is a retrieved document relevant to the query?"""

    RELEVANT = "relevant"
    IRRELEVANT = "irrelevant"
class SupportLevel(Enum):
    """Self-RAG "IsSup" reflection token: how well the answer is backed by the documents."""

    FULLY = "fully_supported"
    PARTIALLY = "partially_supported"
    NO = "no_support"
@dataclass
class SelfRAGResult:
    """Outcome of one Self-RAG generation, bundled with its reflection verdicts."""

    answer: str                                 # final generated answer text
    retrieved: bool                             # True if a retrieval step was executed
    relevance: RelevanceJudgment | None = None  # None when no retrieval / not judged
    support: SupportLevel | None = None         # faithfulness verdict; None if not checked
    usefulness: int = 0                         # 1-5 critic rating; 0 means "not rated"
    sources: list[dict] | None = None           # relevant documents the answer drew on
class SelfRAGPipeline:
    """Self-RAG pipeline: retrieve on demand, filter, generate, then self-verify.

    Collaborators (duck-typed):
        llm_client: exposes ``generate(prompt: str) -> str``.
        retriever:  exposes ``search(query, top_k) -> list[dict]``; each dict
                    must carry at least a ``"content"`` key.
        critic_llm: optional separate judge model; defaults to ``llm_client``.
    """

    def __init__(self, llm_client, retriever, critic_llm=None):
        self.llm = llm_client
        self.retriever = retriever
        # A dedicated critic can be a cheaper or stricter model than the generator.
        self.critic = critic_llm or llm_client

    def generate(self, query: str) -> SelfRAGResult:
        """Run the full Self-RAG loop for *query* and return the annotated result."""
        # Step 1: decide whether external knowledge is needed at all.
        need_retrieval = self._decide_retrieval(query)
        if need_retrieval == RetrieveDecision.NO:
            answer = self.llm.generate(f"请直接回答:{query}")
            return SelfRAGResult(answer=answer, retrieved=False, usefulness=3)
        # Step 2: retrieve candidate documents.
        docs = self.retriever.search(query, top_k=5)
        # Step 3: keep only documents the critic judges relevant.
        relevant_docs = [
            doc for doc in docs
            if self._judge_relevance(query, doc["content"]) == RelevanceJudgment.RELEVANT
        ]
        if not relevant_docs:
            # Nothing usable was retrieved: fall back to parametric knowledge.
            answer = self.llm.generate(f"请根据你的知识回答:{query}")
            return SelfRAGResult(
                answer=answer, retrieved=True,
                relevance=RelevanceJudgment.IRRELEVANT,
            )
        # Step 4: generate an answer grounded in the top relevant documents.
        context = "\n".join(d["content"] for d in relevant_docs[:3])
        answer = self.llm.generate(
            f"根据以下资料回答问题。\n资料:{context}\n问题:{query}\n回答:"
        )
        # Step 5: verify faithfulness; retry once with a stricter prompt if unsupported.
        support = self._check_support(answer, context)
        if support == SupportLevel.NO:
            answer = self.llm.generate(
                f"严格根据以下资料回答,不要编造。\n资料:{context}\n问题:{query}\n回答:"
            )
            support = self._check_support(answer, context)
        return SelfRAGResult(
            answer=answer,
            retrieved=True,
            relevance=RelevanceJudgment.RELEVANT,
            support=support,
            usefulness=self._rate_usefulness(query, answer),
            sources=relevant_docs,
        )

    def _decide_retrieval(self, query: str) -> RetrieveDecision:
        """Ask the critic whether *query* needs external evidence (Retrieve token)."""
        prompt = f"""判断以下问题是否需要查阅外部资料才能准确回答。
- 如果是常识或简单定义,回答 no
- 如果需要最新信息或专业细节,回答 yes
问题:{query}
判断(yes/no):"""
        response = self.critic.generate(prompt).strip().lower()
        if "yes" in response:
            return RetrieveDecision.YES
        return RetrieveDecision.NO

    def _judge_relevance(self, query: str, doc_content: str) -> RelevanceJudgment:
        """Judge document relevance to the query (IsRel token).

        Bug fix: the old check ``"relevant" in response`` also matched the
        substring inside "irrelevant", so every irrelevant verdict was counted
        as relevant and the filter never filtered. Test the negative label first.
        """
        prompt = f"""判断以下文档是否与问题相关。
问题:{query}
文档:{doc_content[:500]}
判断(relevant/irrelevant):"""
        response = self.critic.generate(prompt).strip().lower()
        if "irrelevant" in response:
            return RelevanceJudgment.IRRELEVANT
        if "relevant" in response:
            return RelevanceJudgment.RELEVANT
        # Unparseable critic output: treat as irrelevant (conservative).
        return RelevanceJudgment.IRRELEVANT

    def _check_support(self, answer: str, context: str) -> SupportLevel:
        """Check whether the answer is grounded in the context (IsSup token)."""
        prompt = f"""判断回答是否完全基于提供的资料。
资料:{context[:500]}
回答:{answer}
判断(fully_supported/partially_supported/no_support):"""
        response = self.critic.generate(prompt).strip().lower()
        # "fully" and "partially" are not substrings of each other, so order is safe.
        if "fully" in response:
            return SupportLevel.FULLY
        if "partially" in response:
            return SupportLevel.PARTIALLY
        return SupportLevel.NO

    def _rate_usefulness(self, query: str, answer: str) -> int:
        """Rate answer usefulness 1-5 (IsUse token); default to 3 on parse failure."""
        prompt = f"""给以下回答打分(1-5分)。
问题:{query}
回答:{answer}
分数:"""
        response = self.critic.generate(prompt).strip()
        # Take the first digit anywhere in the reply — critics often prefix text,
        # which made the old ``int(response[0])`` fall back to 3 needlessly.
        digit = next((ch for ch in response if ch.isdigit()), None)
        if digit is None:
            return 3
        return max(1, min(5, int(digit)))
自适应检索策略
除了 Self-RAG,还有多种自适应检索策略:
graph TB
A[自适应检索] --> B[Self-RAG]
A --> C[CRAG
纠正性RAG]
A --> D[Active RAG]
A --> E[Adaptive RAG]
B --> B1[模型自己判断
是否需要检索]
C --> C1[检索后评估
纠正不相关结果]
D --> D1[多轮迭代
渐进式检索]
E --> E1[路由选择
检索策略]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
CRAG:纠正性 RAG
"""
Corrective RAG (CRAG) 实现
"""
from dataclasses import dataclass
@dataclass
class CRAGConfig:
    """Tuning knobs for CRAG's confidence-based routing."""

    confidence_threshold: float = 0.7                   # >= this: use retrieved docs as-is
    ambiguous_range: tuple[float, float] = (0.3, 0.7)   # <= low bound: web fallback; in between: mix both
class CorrectiveRAG:
    """CRAG: grade retrieval quality, then route between local docs and web fallback."""

    def __init__(self, retriever, llm_client, config: CRAGConfig | None = None):
        self.retriever = retriever
        self.llm = llm_client
        self.config = config or CRAGConfig()

    def generate(self, query: str) -> dict:
        """Answer *query*, picking a context strategy from retrieval confidence."""
        # Initial retrieval followed by a quality grade of the hits.
        candidates = self.retriever.search(query, top_k=5)
        confidence = self._evaluate_retrieval(query, candidates)
        print(f"检索置信度: {confidence:.2f}")
        # Route on the confidence score: trust, discard, or blend.
        if confidence >= self.config.confidence_threshold:
            # Confident retrieval: use the documents unchanged.
            action, context_docs = "correct", candidates
        elif confidence <= self.config.ambiguous_range[0]:
            # Poor retrieval: discard it and lean on web search instead.
            action, context_docs = "incorrect", self._web_search_fallback(query)
        else:
            # Ambiguous band: combine refined local docs with web results.
            action = "ambiguous"
            web_hits = self._web_search_fallback(query)
            context_docs = self._refine_documents(query, candidates) + web_hits
        context = "\n".join(d.get("content", "")[:300] for d in context_docs[:5])
        answer = self.llm.generate(
            f"根据以下资料回答:\n{context}\n\n问题:{query}\n回答:"
        )
        return {"answer": answer, "action": action, "confidence": confidence}

    def _evaluate_retrieval(self, query: str, docs: list[dict]) -> float:
        """Average the top-3 retrieval scores (0.5 assumed when a doc has no score)."""
        if not docs:
            return 0.0
        top_scores = [doc.get("score", 0.5) for doc in docs][:3]
        return sum(top_scores) / len(top_scores)

    def _web_search_fallback(self, query: str) -> list[dict]:
        """Web search stand-in used when local retrieval is not trusted (placeholder)."""
        print(f" 触发 Web 搜索兜底: {query}")
        return [{"content": f"Web search results for: {query}", "source": "web"}]

    def _refine_documents(self, query: str, docs: list[dict]) -> list[dict]:
        """Ask the LLM to keep only the query-relevant part of each document."""
        kept = []
        for doc in docs:
            prompt = f"从以下文档中提取与问题相关的部分。\n问题:{query}\n文档:{doc.get('content', '')[:500]}\n相关内容:"
            extracted = self.llm.generate(prompt)
            # Drop extractions too short to be a usable snippet.
            if len(extracted.strip()) > 20:
                kept.append({"content": extracted, "source": doc.get("source", "")})
        return kept
策略对比
| 策略 | 检索时机 | 质量控制 | 额外开销 | 特点 |
|---|---|---|---|---|
| Naive RAG | 每次都检索 | 无 | 低 | 简单但可能引入噪声 |
| Self-RAG | 模型自主决定 | 4种反思令牌 | 中 | 端到端训练 |
| CRAG | 每次检索 + 评估 | 置信度分层 | 中 | Web 搜索兜底 |
| Active RAG | 迭代式检索 | 每轮评估是否继续 | 高 | 适合复杂推理 |
| Adaptive RAG | 路由选择 | 分类器判断 | 低 | 灵活路由 |
本章小结
| 主题 | 要点 |
|---|---|
| Self-RAG | 4种反思令牌控制检索-生成全流程 |
| 检索决策 | 不是所有查询都需要检索 |
| 忠实度验证 | 生成后检查是否有文档支持 |
| CRAG | 检索后评估置信度,低则 Web 兜底 |
| 实战建议 | 先基线 RAG,识别瓶颈后再引入自适应 |
下一章:多模态 RAG