3 min read · 555 words

查询改写与扩展

用户的原始查询往往含糊、简短或使用非标准表述。查询改写（Query Rewriting）和查询扩展（Query Expansion）是提升检索召回率的关键技术。

查询改写技术全景

graph TB
    A[原始查询] --> B[查询分析]
    B --> C{改写策略}
    C --> D[同义词扩展]
    C --> E[HyDE 假设文档]
    C --> F[子问题分解]
    C --> G[Step-Back 抽象化]
    D --> H[扩展后查询集]
    E --> H
    F --> H
    G --> H
    H --> I[多路检索]
    I --> J[结果融合]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style J fill:#c8e6c9,stroke:#388e3c,stroke-width:3px

HyDE：假设文档嵌入

HyDE（Hypothetical Document Embeddings）是一种反直觉但非常有效的方法：先让 LLM 生成一个"假设性答案"，然后用这个答案去检索。

"""
HyDE 假设文档嵌入实现
"""
from dataclasses import dataclass, field
@dataclass
class HyDEConfig:
    """Settings for HyDE (Hypothetical Document Embeddings) query expansion."""

    # Model identifier; no code in view reads this field.
    model: str = "gpt-4o-mini"
    # How many hypothetical passages the expander generates per query.
    num_hypotheses: int = 3
    # Sampling temperature forwarded to the LLM client for each passage.
    temperature: float = 0.7
class HyDEQueryExpander:
    """Expand a query into embeddings of itself plus LLM-written hypothetical passages."""

    HYPOTHESIS_PROMPT = """请根据以下问题，写一段可能包含答案的文档段落。
不需要真实准确，只需要包含相关术语和概念。
问题：{query}
假设文档段落："""

    def __init__(self, llm_client, embed_client, config: HyDEConfig | None = None):
        """Store the clients; a missing (or falsy) config falls back to ``HyDEConfig()``.

        Args:
            llm_client: Object exposing ``generate(prompt, temperature=...)``.
            embed_client: Object exposing ``embed(text)``.
            config: Optional expansion settings.
        """
        self.config = config if config else HyDEConfig()
        self.llm = llm_client
        self.embedder = embed_client

    def expand(self, query: str) -> list[list[float]]:
        """Embed the query and ``config.num_hypotheses`` generated passages.

        Args:
            query: The user's original query text.

        Returns:
            A list whose first element is the embedding of ``query`` itself,
            followed by one embedding per generated hypothetical passage.
        """
        # The prompt is identical for every hypothesis, so build it once.
        prompt = self.HYPOTHESIS_PROMPT.format(query=query)

        # The original query's vector always comes first.
        vectors = [self.embedder.embed(query)]
        # Lazily generate one passage, embed it, then move on to the next.
        vectors.extend(
            self.embedder.embed(
                self.llm.generate(prompt, temperature=self.config.temperature)
            )
            for _ in range(self.config.num_hypotheses)
        )
        return vectors

HyDE 原理

步骤	操作	说明
1	接收原始查询	用户的自然语言问题
2	LLM 生成假设答案	不需要准确，重在覆盖关键词
3	假设答案向量化	与文档在同一语义空间
4	用假设向量检索	比直接用问题向量效果更好
5	合并检索结果	原始查询 + 假设文档的结果取并集

子问题分解

复杂查询可以拆解为多个简单子问题，分别检索后合并答案。

"""
子问题分解策略
"""
# `field` is required by the `dependencies` default below; importing only
# `dataclass` makes this snippet fail with NameError when run on its own.
from dataclasses import dataclass, field


@dataclass
class SubQuestion:
    """One sub-question produced by decomposing a complex query."""

    # Text of the sub-question.
    question: str
    # Retrieval intent label attached to the sub-question.
    intent: str
    # Presumably indices of sub-questions this one depends on; no code in
    # view reads it. A default factory gives every instance its own list.
    dependencies: list[int] = field(default_factory=list)
class QueryDecomposer:
    """Ask an LLM to split a complex query into independent sub-questions."""

    DECOMPOSE_PROMPT = """将以下复杂问题分解为 2-4 个独立的子问题。
每个子问题应该可以独立回答。
复杂问题：{query}
请以 JSON 格式返回：
[{{"question": "子问题1", "intent": "检索意图"}}, ...]"""

    def __init__(self, llm_client):
        """Store the LLM client, which must expose ``generate(prompt)``."""
        self.llm = llm_client

    @staticmethod
    def _parse_sub_questions(response) -> list[tuple[str, str]]:
        """Extract ``(question, intent)`` pairs from the raw LLM response.

        The outermost ``[...]`` span is parsed, so a JSON array wrapped in
        Markdown code fences or surrounded by explanatory text is accepted.

        Raises:
            ValueError: If the response is not text, contains no JSON array,
                the array is empty, or an item is not an object carrying both
                ``"question"`` and ``"intent"``.
        """
        import json

        if not isinstance(response, str):
            raise ValueError("LLM response is not text")

        start = response.find("[")
        end = response.rfind("]")
        if start == -1 or end < start:
            raise ValueError("no JSON array found in LLM response")

        # json.JSONDecodeError subclasses ValueError, so callers need to
        # catch only one exception type.
        items = json.loads(response[start:end + 1])

        pairs = []
        for item in items:
            if not isinstance(item, dict) or "question" not in item or "intent" not in item:
                raise ValueError("malformed sub-question item")
            pairs.append((item["question"], item["intent"]))
        if not pairs:
            raise ValueError("LLM returned no sub-questions")
        return pairs

    def decompose(self, query: str) -> list[SubQuestion]:
        """Split ``query`` into sub-questions.

        Args:
            query: The complex user query.

        Returns:
            One ``SubQuestion`` per item returned by the LLM. When the
            response cannot be used (unparseable, wrong shape, or empty), a
            single ``SubQuestion`` holding the original query with intent
            ``"original"`` is returned, so callers always have something to
            retrieve with.
        """
        response = self.llm.generate(self.DECOMPOSE_PROMPT.format(query=query))
        try:
            pairs = self._parse_sub_questions(response)
        except ValueError:
            # Best-effort: fall back to the original query.
            return [SubQuestion(question=query, intent="original")]
        return [SubQuestion(question=question, intent=intent) for question, intent in pairs]
class DecomposeRAGPipeline:
    """RAG pipeline that retrieves per sub-question and answers from the merged context."""

    def __init__(self, decomposer, retriever, generator):
        """Keep the three collaborators.

        Args:
            decomposer: Object exposing ``decompose(query)``.
            retriever: Object exposing ``search(text, top_k=...)``.
            generator: Object exposing
                ``generate(query=..., context=..., sub_questions=...)``.
        """
        self.decomposer = decomposer
        self.retriever = retriever
        self.generator = generator

    def answer(self, query: str) -> dict:
        """Decompose, retrieve per sub-question, de-duplicate, then generate.

        Args:
            query: The user's original question.

        Returns:
            A dict with keys ``"answer"`` (the generator output),
            ``"sub_questions"`` (each sub-question with 200-character previews
            of its retrieved documents) and ``"sources"`` (the de-duplicated
            documents).
        """
        parts = self.decomposer.decompose(query)
        print(f"分解为 {len(parts)} 个子问题")

        # Retrieve independently for every sub-question.
        retrieved = []
        per_question = []
        for part in parts:
            hits = self.retriever.search(part.question, top_k=3)
            retrieved.extend(hits)
            per_question.append({
                "question": part.question,
                "docs": [hit["content"][:200] for hit in hits],
            })

        # Keep the first document seen for each id; dicts preserve insertion
        # order, so the retrieval order of the survivors is unchanged.
        first_by_id = {}
        for doc in retrieved:
            first_by_id.setdefault(doc["id"], doc)
        sources = list(first_by_id.values())

        final_answer = self.generator.generate(
            query=query,
            context=sources,
            sub_questions=per_question,
        )
        return {"answer": final_answer, "sub_questions": per_question, "sources": sources}

Step-Back Prompting

Step-Back 策略先将具体问题抽象为更高层次的问题，检索更广泛的背景知识。

graph LR
    A["具体问题:<br/>Python 3.12 的 GIL 改进是什么?"] --> B[Step-Back]
    B --> C["抽象问题:<br/>Python GIL 的工作原理和演进历史"]
    C --> D[检索背景知识]
    A --> E[检索具体答案]
    D --> F[合并上下文]
    E --> F
    F --> G[生成最终回答]
    style B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style G fill:#c8e6c9,stroke:#388e3c,stroke-width:2px

"""
Step-Back 查询抽象
"""
class StepBackRewriter:
    """Rewrite a specific question into a broader background ("step-back") question."""

    STEP_BACK_PROMPT = """给定一个具体问题，生成一个更高层次的背景问题。
这个背景问题应该覆盖回答原问题所需的基础知识。
具体问题：{query}
背景问题："""

    def __init__(self, llm_client):
        """Store the LLM client, which must expose ``generate(prompt)``."""
        self.llm = llm_client

    def rewrite(self, query: str) -> tuple[str, str]:
        """Generate the background question for ``query``.

        Args:
            query: The specific question to abstract.

        Returns:
            ``(query, background_question)`` where the second element is the
            LLM output with surrounding whitespace stripped.
        """
        prompt = self.STEP_BACK_PROMPT.format(query=query)
        background = self.llm.generate(prompt).strip()
        return (query, background)

查询改写策略对比

策略	适用场景	优势	劣势	延迟增加
同义词扩展	术语不统一	简单快速	覆盖有限	低
HyDE	问题与文档语义差距大	桥接 Q-A 语义鸿沟	需额外 LLM 调用	中
子问题分解	复杂多跳问题	全面覆盖	检索量倍增	高
Step-Back	需要背景知识的问题	补充上下文	可能引入噪声	中
多语言翻译	跨语言知识库	突破语言壁垒	翻译质量影响	中

实战：组合改写策略

"""
组合查询改写管道
"""
from dataclasses import dataclass
from enum import Enum
class RewriteStrategy(Enum):
    """Closed set of query-rewrite strategies; each value is its string tag."""

    # Pass the query through unchanged.
    ORIGINAL = "original"
    # Hypothetical-document generation (HyDE).
    HYDE = "hyde"
    # Split the query into sub-questions.
    DECOMPOSE = "decompose"
    # Abstract the query into a broader background question.
    STEP_BACK = "step_back"
@dataclass
class RewriteResult:
    """Outcome of applying one rewrite strategy to a query."""

    # The strategy that produced this result.
    strategy: RewriteStrategy
    # Query texts produced by the strategy.
    queries: list[str]
    # Free-form extra information attached by the strategy.
    metadata: dict
class AdaptiveQueryRewriter:
    """Choose rewrite strategies from surface features of a query and apply them.

    ``classify_query`` picks the strategies; ``rewrite`` runs every selected
    strategy through the LLM client and returns one ``RewriteResult`` each.
    """

    # Prompt for the DECOMPOSE strategy (doubled braces survive str.format).
    _DECOMPOSE_PROMPT = """将以下复杂问题分解为 2-4 个独立的子问题。
每个子问题应该可以独立回答。
复杂问题：{query}
请以 JSON 格式返回：
[{{"question": "子问题1", "intent": "检索意图"}}, ...]"""

    # Prompt for the STEP_BACK strategy.
    _STEP_BACK_PROMPT = """给定一个具体问题，生成一个更高层次的背景问题。
这个背景问题应该覆盖回答原问题所需的基础知识。
具体问题：{query}
背景问题："""

    def __init__(self, llm_client, embed_client):
        """Store the clients.

        Args:
            llm_client: Object exposing ``generate(prompt)``.
            embed_client: Embedding client; stored but not used by this class.
        """
        self.llm = llm_client
        self.embed = embed_client

    @staticmethod
    def _count_units(query: str) -> int:
        """Count length units: one per CJK ideograph, one per other non-space run.

        ``str.split()`` treats a Chinese sentence without spaces as a single
        word, which made every such query look "short". For whitespace
        delimited text the result equals ``len(query.split())``.
        """
        import re

        cjk = "\u3400-\u4dbf\u4e00-\u9fff"
        return len(re.findall(f"[{cjk}]|[^\\s{cjk}]+", query))

    @staticmethod
    def _parse_sub_queries(response) -> list[str]:
        """Pull sub-question texts out of an LLM response; ``[]`` if unusable.

        Accepts a JSON array (optionally wrapped in code fences or prose)
        whose items are either strings or objects with a ``"question"`` key.
        """
        import json

        if not isinstance(response, str):
            return []
        start = response.find("[")
        end = response.rfind("]")
        if start == -1 or end < start:
            return []
        try:
            items = json.loads(response[start:end + 1])
        except json.JSONDecodeError:
            return []

        sub_queries = []
        for item in items:
            text = item.get("question") if isinstance(item, dict) else item
            if isinstance(text, str) and text.strip():
                sub_queries.append(text.strip())
        return sub_queries

    def classify_query(self, query: str) -> list[RewriteStrategy]:
        """Select rewrite strategies from simple features of ``query``.

        Args:
            query: The user's query text.

        Returns:
            A list that always starts with ``RewriteStrategy.ORIGINAL`` and
            may additionally contain ``HYDE`` (fewer than 5 length units),
            ``DECOMPOSE`` (more than 15 units, or contains "和" together with
            a question mark) and ``STEP_BACK`` (contains one of the
            "why / principle / how is it implemented / low-level" keywords).
        """
        strategies = [RewriteStrategy.ORIGINAL]
        length = self._count_units(query)
        has_question_mark = "?" in query or "？" in query

        # Short query -> HyDE adds semantic context.
        if length < 5:
            strategies.append(RewriteStrategy.HYDE)
        # Long or conjunctive question -> decomposition.
        if length > 15 or ("和" in query and has_question_mark):
            strategies.append(RewriteStrategy.DECOMPOSE)
        # Questions about mechanisms -> step-back for background knowledge.
        if any(kw in query for kw in ["为什么", "原理", "如何实现", "底层"]):
            strategies.append(RewriteStrategy.STEP_BACK)
        return strategies

    def rewrite(self, query: str) -> list[RewriteResult]:
        """Run every strategy chosen by ``classify_query``.

        Args:
            query: The user's query text.

        Returns:
            One ``RewriteResult`` per selected strategy, in selection order.
            ``ORIGINAL`` carries the query unchanged with empty metadata; the
            other strategies carry the LLM-produced text(s) and record the
            original query under ``metadata["original"]``. ``DECOMPOSE`` falls
            back to the original query when the LLM output cannot be parsed.
        """
        results = []
        for strategy in self.classify_query(query):
            if strategy == RewriteStrategy.ORIGINAL:
                queries, metadata = [query], {}
            elif strategy == RewriteStrategy.HYDE:
                hypothesis = self.llm.generate(
                    f"请写一段描述以下问题答案的文字：{query}"
                )
                queries, metadata = [hypothesis], {"original": query}
            elif strategy == RewriteStrategy.DECOMPOSE:
                response = self.llm.generate(
                    self._DECOMPOSE_PROMPT.format(query=query)
                )
                queries = self._parse_sub_queries(response) or [query]
                metadata = {"original": query}
            elif strategy == RewriteStrategy.STEP_BACK:
                abstract_query = self.llm.generate(
                    self._STEP_BACK_PROMPT.format(query=query)
                )
                queries, metadata = [abstract_query.strip()], {"original": query}
            else:
                # Unknown strategy: nothing to run.
                continue
            results.append(
                RewriteResult(strategy=strategy, queries=queries, metadata=metadata)
            )
        return results

本章小结

主题	要点
HyDE	生成假设文档向量，桥接问答语义鸿沟
子问题分解	复杂查询拆分，分别检索后合并
Step-Back	抽象化具体问题，补充背景知识
自适应改写	根据查询特征自动选择策略组合
实战建议	先评估基线，再逐步叠加改写策略

下一章：多路召回与融合策略