Graph RAG:图增强检索
High Contrast
Dark Mode
Light Mode
Sepia
Forest
2 min read · 346 words

Graph RAG:图增强检索

传统 RAG 基于文本块的独立检索,难以捕获文档间的关联关系。Graph RAG 通过构建知识图谱,让检索具有"推理"能力。

Graph RAG vs 传统 RAG

graph TB
    subgraph 传统RAG
        A1[查询] --> B1[向量检索]
        B1 --> C1[独立文本块]
        C1 --> D1[生成回答]
    end
    subgraph GraphRAG
        A2[查询] --> B2[实体识别]
        B2 --> C2[图谱遍历]
        C2 --> D2[关联文档 + 关系]
        D2 --> E2[生成回答]
    end
    style A1 fill:#ffebee,stroke:#c62828,stroke-width:2px
    style A2 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
    style E2 fill:#c8e6c9,stroke:#388e3c,stroke-width:3px
维度 传统 RAG Graph RAG
检索单元 文本块 实体 + 关系 + 社区
上下文理解 局部(单块) 全局(跨文档关联)
多跳推理 困难 天然支持
全局摘要 无法 社区摘要
构建成本 低 高(需抽取实体关系)
适用场景 事实检索 关系推理、全局分析

知识图谱构建

"""
从文档中自动构建知识图谱
"""
from dataclasses import dataclass, field
@dataclass
class Entity:
    """A single entity (node) of the knowledge graph.

    ``name`` and ``entity_type`` are required. ``description`` defaults to
    an empty string and ``properties`` to a new empty dict.
    """

    # Entity name.
    name: str
    # Type label of the entity.
    entity_type: str
    # Optional free-text description.
    description: str = ""
    # Extra attributes; default_factory gives every instance its own dict.
    properties: dict = field(default_factory=dict)
@dataclass
class Relation:
    """A typed link between a source entity and a target entity.

    ``source``, ``target`` and ``relation_type`` are required. ``weight``
    defaults to 1.0 and ``properties`` to a new empty dict.
    """

    # Name of the source entity.
    source: str
    # Name of the target entity.
    target: str
    # Label describing the kind of relation.
    relation_type: str
    # Numeric weight of the relation.
    weight: float = 1.0
    # Extra attributes; default_factory gives every instance its own dict.
    properties: dict = field(default_factory=dict)
@dataclass
class KnowledgeGraph:
    """In-memory knowledge graph.

    Entities are stored in a dict keyed by entity name; relations are kept
    in insertion order in a plain list.
    """

    entities: dict[str, Entity] = field(default_factory=dict)
    relations: list[Relation] = field(default_factory=list)

    def add_entity(self, entity: Entity) -> None:
        """Store ``entity`` under its name, replacing any entry with that name."""
        self.entities[entity.name] = entity

    def add_relation(self, relation: Relation) -> None:
        """Append ``relation``; its endpoints are not checked against ``entities``."""
        self.relations.append(relation)

    def get_neighbors(self, entity_name: str, hops: int = 1) -> set[str]:
        """Return the names reachable from ``entity_name`` within ``hops`` steps.

        Relations are followed in both directions. The start entity itself
        is never part of the result.

        Args:
            entity_name: Name to start from. It does not have to be
                registered in ``entities``; an unknown name yields an
                empty set.
            hops: Maximum number of relation steps to follow. Zero or a
                negative value yields an empty set.

        Returns:
            The set of neighbour names found within ``hops`` steps.
        """
        # Index the relation list once per call so each hop is a dictionary
        # lookup instead of a scan over every relation for every frontier node.
        adjacency: dict[str, set[str]] = {}
        for rel in self.relations:
            adjacency.setdefault(rel.source, set()).add(rel.target)
            adjacency.setdefault(rel.target, set()).add(rel.source)

        visited = {entity_name}
        frontier = {entity_name}
        for _ in range(hops):
            next_frontier: set[str] = set()
            for node in frontier:
                next_frontier |= adjacency.get(node, set()) - visited
            if not next_frontier:
                # Nothing new is reachable; further hops cannot add anything.
                break
            visited |= next_frontier
            frontier = next_frontier

        visited.discard(entity_name)
        return visited
class GraphBuilder:
    """Build a ``KnowledgeGraph`` by asking an LLM to extract entities and relations.

    The LLM client only needs a ``generate(prompt)`` method; its return
    value is parsed as JSON.
    """

    # Prompt template; ``{text}`` is replaced with the chunk, the doubled
    # braces are literal braces of the requested JSON layout.
    EXTRACT_PROMPT = """从以下文本中提取实体和关系。
文本:
{text}
请以 JSON 格式返回:
{{
"entities": [{{"name": "实体名", "type": "类型", "description": "描述"}}],
"relations": [{{"source": "源实体", "target": "目标实体", "type": "关系类型"}}]
}}"""

    def __init__(self, llm_client):
        """Keep a reference to the client used for extraction calls."""
        self.llm = llm_client

    def extract_from_chunk(self, text: str) -> tuple[list[Entity], list[Relation]]:
        """Extract entities and relations from a single text chunk.

        Args:
            text: The chunk inserted into ``EXTRACT_PROMPT``.

        Returns:
            ``(entities, relations)``. Extraction is best-effort: if the
            response is not valid JSON, is not a JSON object, or contains
            malformed items, ``([], [])`` is returned instead of raising.
        """
        import json

        response = self.llm.generate(self.EXTRACT_PROMPT.format(text=text))

        try:
            data = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a response that is not str/bytes (e.g. None).
            return [], []

        # Valid JSON can still be a list, string, number or null.
        if not isinstance(data, dict):
            return [], []

        try:
            entities = [
                Entity(
                    name=item["name"],
                    entity_type=item["type"],
                    description=item.get("description", ""),
                )
                for item in data.get("entities") or []
            ]
            relations = [
                Relation(
                    source=item["source"],
                    target=item["target"],
                    relation_type=item["type"],
                )
                for item in data.get("relations") or []
            ]
        except (KeyError, TypeError, AttributeError):
            # Missing required keys, or items/containers of the wrong JSON type.
            return [], []

        return entities, relations

    def build(self, chunks: list[str]) -> KnowledgeGraph:
        """Build a knowledge graph from a list of text chunks.

        Each chunk is sent to the LLM once. Entities are added with
        ``add_entity`` and relations with ``add_relation``. Progress is
        printed per chunk, followed by a final total.
        """
        kg = KnowledgeGraph()
        total = len(chunks)
        for position, chunk in enumerate(chunks, start=1):
            entities, relations = self.extract_from_chunk(chunk)
            for entity in entities:
                kg.add_entity(entity)
            for relation in relations:
                kg.add_relation(relation)
            print(f"  块 {position}/{total}: {len(entities)} 实体, {len(relations)} 关系")
        print(f"图谱构建完成: {len(kg.entities)} 实体, {len(kg.relations)} 关系")
        return kg

社区检测与摘要

Graph RAG 的关键创新是社区摘要:将图谱按社区(Community)分组,为每个社区生成摘要,实现全局理解。

graph TB
    A[知识图谱] --> B[社区检测<br/>Leiden 算法]
    B --> C1[社区 1]
    B --> C2[社区 2]
    B --> C3[社区 N]
    C1 --> D1[社区摘要 1]
    C2 --> D2[社区摘要 2]
    C3 --> D3[社区摘要 N]
    D1 --> E[全局索引]
    D2 --> E
    D3 --> E
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    style B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style E fill:#c8e6c9,stroke:#388e3c,stroke-width:3px
"""
社区检测与摘要生成
"""
from dataclasses import dataclass
@dataclass
class Community:
    """A group of entities together with the relations attached to the group.

    ``id``, ``entities`` and ``relations`` are required. ``summary``
    defaults to an empty string and ``level`` to 0.
    """

    # Identifier of the community.
    id: str
    # Names of the member entities.
    entities: list[str]
    # Relations associated with this community.
    relations: list[Relation]
    # Text summary; empty until one is assigned.
    summary: str = ""
    # Integer level of the community.
    level: int = 0
class CommunityDetector:
    """Partition a knowledge graph into size-capped communities."""

    def detect(self, kg: KnowledgeGraph, max_size: int = 20) -> list[Community]:
        """Group entities into communities with a capped breadth-first search.

        This is a simplified stand-in; a real implementation should use the
        Leiden or Louvain algorithm.

        Entities are visited in ``kg.entities`` order. Each unvisited entity
        seeds a BFS that collects at most ``max_size`` members. Whatever the
        cap leaves unvisited seeds later communities.

        Args:
            kg: Graph to partition. Relations with an endpoint missing from
                ``kg.entities`` are ignored.
            max_size: Maximum number of entities per community.

        Returns:
            Communities with ids ``c_0``, ``c_1``, ... in discovery order.
            Each holds its members in BFS order and only the relations whose
            two endpoints are both members.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        from collections import deque

        if max_size < 1:
            raise ValueError("max_size must be at least 1")

        # Undirected adjacency restricted to known entities.
        adjacency: dict[str, set[str]] = {name: set() for name in kg.entities}
        for rel in kg.relations:
            if rel.source in adjacency and rel.target in adjacency:
                adjacency[rel.source].add(rel.target)
                adjacency[rel.target].add(rel.source)

        visited: set[str] = set()
        communities = []
        for seed in kg.entities:
            if seed in visited:
                continue

            members = []
            queue = deque([seed])
            while queue and len(members) < max_size:
                node = queue.popleft()
                if node in visited:
                    continue
                visited.add(node)
                members.append(node)
                # Sorted so the split is reproducible: set iteration order for
                # strings varies between interpreter runs.
                queue.extend(sorted(adjacency[node] - visited))

            member_set = set(members)
            internal_relations = [
                rel for rel in kg.relations
                if rel.source in member_set and rel.target in member_set
            ]
            communities.append(Community(
                id=f"c_{len(communities)}",
                entities=members,
                relations=internal_relations,
            ))
        return communities

Graph RAG 查询流程

"""
Graph RAG 查询引擎
"""
class GraphRAGEngine:
    """Query engine offering entity-centred (local) and summary-based (global) search."""

    def __init__(self, kg: KnowledgeGraph, communities: list[Community], llm_client):
        """Store the graph, its communities and the LLM client.

        The client only needs a ``generate(prompt)`` method.
        """
        self.kg = kg
        self.communities = communities
        self.llm = llm_client

    def local_search(self, query: str, top_k: int = 5) -> list[dict]:
        """Local search: start from entities named in the query and walk their neighbours.

        Args:
            query: Query text; entities are found by name matching.
            top_k: Maximum number of context documents. Zero or a negative
                value yields an empty list.

        Returns:
            One dict per matched entity, in ``kg.entities`` order, with the
            keys ``entity``, ``description``, ``neighbors`` (up to 10 names
            within two hops, sorted) and ``relations`` (formatted relations
            that touch the entity directly).
        """
        if top_k <= 0:
            return []

        context_docs = []
        for entity_name in self._extract_query_entities(query):
            if entity_name not in self.kg.entities:
                continue
            entity = self.kg.entities[entity_name]
            neighbors = self.kg.get_neighbors(entity_name, hops=2)
            touching = [
                f"{r.source} --{r.relation_type}--> {r.target}"
                for r in self.kg.relations
                if r.source == entity_name or r.target == entity_name
            ]
            context_docs.append({
                "entity": entity_name,
                "description": entity.description,
                # Sorted before truncating so the same ten names are kept
                # on every run.
                "neighbors": sorted(neighbors)[:10],
                "relations": touching,
            })
            if len(context_docs) == top_k:
                # Enough documents; skip the remaining graph traversals.
                break
        return context_docs

    def global_search(self, query: str, max_summaries: int = 10) -> str:
        """Global search: answer the query from community summaries.

        Args:
            query: Question to answer.
            max_summaries: Maximum number of non-empty community summaries
                placed in the prompt, taken in ``communities`` order.

        Returns:
            Whatever ``llm.generate`` returns for the assembled prompt.
        """
        summaries = [c.summary for c in self.communities if c.summary]
        bullet_list = "\n".join(f"- {s}" for s in summaries[:max_summaries])
        prompt = f"""基于以下知识社区摘要,回答问题。
社区摘要:
{bullet_list}
问题:{query}
回答:"""
        return self.llm.generate(prompt)

    def _extract_query_entities(self, query: str) -> list[str]:
        """Return the graph entity names that occur in ``query`` (simplified).

        Matching is a case-insensitive substring test. Empty names are
        skipped because the empty string is a substring of every query.
        """
        lowered_query = query.lower()
        return [
            entity_name
            for entity_name in self.kg.entities
            if entity_name and entity_name.lower() in lowered_query
        ]

本章小结

主题 要点
Graph RAG 结合知识图谱与 RAG,支持多跳推理
图谱构建 LLM 抽取实体关系,成本较高但效果好
社区摘要 Leiden 算法分组 + LLM 摘要,实现全局理解
局部/全局搜索 局部从实体出发遍历,全局用社区摘要
适用场景 复杂关系推理、跨文档分析、全局问答

下一章:Self-RAG 与自适应检索