文档加载与解析
RAG 系统的第一步是将各种格式的文档加载到系统中。文档处理的质量直接决定了后续检索和回答的效果。
常见文档格式
graph TB
A[文档来源] --> B[结构化]
A --> C[半结构化]
A --> D[非结构化]
B --> B1[数据库记录]
B --> B2[CSV / Excel]
B --> B3[JSON / XML]
C --> C1[Markdown]
C --> C2[HTML 网页]
C --> C3[PDF 含表格]
D --> D1[纯文本 TXT]
D --> D2[扫描件 PDF]
D --> D3[图片中的文字]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style B fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style D fill:#ffcdd2,stroke:#c62828,stroke-width:2px
文档加载器实现
通用文档加载框架
"""
通用文档加载框架
支持多种格式的文档加载
"""
from pathlib import Path
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
@dataclass
class Document:
    """A loaded document: extracted text plus free-form metadata.

    Attributes:
        content: The extracted text of the document.
        metadata: Arbitrary key/value metadata (source, title, format, ...).
    """

    content: str  # extracted text
    metadata: dict = field(default_factory=dict)  # source/title/format, etc.

    @property
    def title(self) -> str:
        """Title recorded in metadata; "Untitled" when absent."""
        meta = self.metadata
        return meta.get("title", "Untitled")

    @property
    def source(self) -> str:
        """Origin of the document (path or URL); empty string when absent."""
        meta = self.metadata
        return meta.get("source", "")
class BaseLoader(ABC):
    """Common interface for document loaders.

    Subclasses implement ``load`` for a single file; ``load_directory``
    walks a directory tree and aggregates results, skipping files that
    fail to load so one bad file does not abort the whole batch.
    """

    @abstractmethod
    def load(self, path: str) -> list[Document]:
        """Load one file and return its documents."""
        pass

    def load_directory(self, directory: str, glob: str = "*") -> list[Document]:
        """Recursively load every file under *directory* matching *glob*.

        Load failures are reported with a warning and skipped.
        """
        collected: list[Document] = []
        for entry in Path(directory).rglob(glob):
            if not entry.is_file():
                continue
            try:
                collected.extend(self.load(str(entry)))
            except Exception as exc:
                print(f"Warning: Failed to load {entry}: {exc}")
        return collected
Markdown 加载器
"""Markdown 文档加载器"""
import re
from pathlib import Path
class MarkdownLoader(BaseLoader):
    """Load a Markdown file into a single Document.

    Extracts the first level-1 ATX heading as the title and any leading
    YAML frontmatter as extra metadata.
    """

    def load(self, path: str) -> list[Document]:
        """Load *path* as UTF-8 Markdown and return one Document."""
        content = Path(path).read_text(encoding="utf-8")
        # Title: first "# ..." heading, else the file name without suffix.
        title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
        title = title_match.group(1) if title_match else Path(path).stem
        # Frontmatter keys are merged first so source/title/format win.
        metadata = self._extract_frontmatter(content)
        metadata.update({
            "source": path,
            "title": title,
            "format": "markdown"
        })
        # Strip the frontmatter block so it does not pollute the text.
        content = re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL)
        return [Document(content=content.strip(), metadata=metadata)]

    def _extract_frontmatter(self, content: str) -> dict:
        """Parse leading YAML frontmatter; always return a dict.

        Returns {} when there is no frontmatter, when it is invalid YAML,
        or when it parses to a non-mapping (e.g. a list or scalar) — the
        caller unconditionally calls ``dict.update`` on the result, so a
        non-dict would previously have raised.
        """
        match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
        if not match:
            return {}
        # Imported lazily: PyYAML is only required for files that
        # actually carry frontmatter.
        import yaml
        try:
            data = yaml.safe_load(match.group(1))
        except yaml.YAMLError:
            return {}
        return data if isinstance(data, dict) else {}
# Usage example: load one Markdown file and inspect the parsed result.
loader = MarkdownLoader()
docs = loader.load("docs/api-guide.md")
print(f"标题: {docs[0].title}")
print(f"内容长度: {len(docs[0].content)} 字符")
PDF 加载器
"""PDF 文档加载器"""
class PDFLoader(BaseLoader):
    """Load a PDF file, producing one Document per non-empty page.

    With ``use_ocr=True`` pages are rendered to images and run through
    Tesseract OCR instead of plain text extraction (for scanned PDFs).
    """

    def __init__(self, use_ocr: bool = False):
        # Whether to OCR pages instead of extracting embedded text.
        # Fix: this flag was previously stored but never consulted.
        self.use_ocr = use_ocr

    def load(self, path: str) -> list[Document]:
        """Load a PDF, dispatching on the ``use_ocr`` setting."""
        if self.use_ocr:
            return self._load_with_ocr(path)
        # Default: PyPDF2 text extraction (lightweight, text-based PDFs).
        return self._load_with_pypdf(path)

    def _load_with_pypdf(self, path: str) -> list[Document]:
        """Extract embedded text with PyPDF2; skip blank pages."""
        from PyPDF2 import PdfReader
        reader = PdfReader(path)
        documents = []
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            if text.strip():
                doc = Document(
                    content=text.strip(),
                    metadata={
                        "source": path,
                        "page": i + 1,  # 1-based page numbering
                        "total_pages": len(reader.pages),
                        "format": "pdf"
                    }
                )
                documents.append(doc)
        return documents

    def _load_with_ocr(self, path: str) -> list[Document]:
        """OCR scanned pages (requires pytesseract and pdf2image)."""
        import pytesseract
        from pdf2image import convert_from_path
        images = convert_from_path(path)
        documents = []
        for i, image in enumerate(images):
            # Simplified Chinese + English recognition.
            text = pytesseract.image_to_string(image, lang="chi_sim+eng")
            if text.strip():
                doc = Document(
                    content=text.strip(),
                    metadata={
                        "source": path,
                        "page": i + 1,
                        "format": "pdf_ocr"
                    }
                )
                documents.append(doc)
        return documents
# Usage example: extract text from a text-based PDF, page by page.
loader = PDFLoader()
docs = loader.load("reports/annual-report.pdf")
print(f"加载了 {len(docs)} 页")
for doc in docs[:3]:
    print(f" 第 {doc.metadata['page']} 页: {doc.content[:50]}...")
HTML / 网页加载器
"""HTML / 网页加载器"""
import re
import requests
from bs4 import BeautifulSoup
class WebLoader(BaseLoader):
    """Fetch a web page and extract its readable text."""

    def __init__(self, timeout: int = 30):
        # HTTP request timeout in seconds.
        self.timeout = timeout

    def load(self, url: str) -> list[Document]:
        """Download *url* and return its visible text as one Document.

        Raises ``requests.HTTPError`` for non-2xx responses.
        """
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content elements before extracting text.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        # Fix: soup.title.string is None when <title> contains nested
        # markup; guard so metadata["title"] is always a str.
        title = (soup.title.string or "") if soup.title else ""
        # Extract visible text and collapse runs of blank lines.
        text = soup.get_text(separator="\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return [Document(
            content=text,
            metadata={
                "source": url,
                "title": title,
                "format": "html",
            }
        )]
# Usage example: fetch a documentation page and inspect the result.
loader = WebLoader()
docs = loader.load("https://docs.python.org/3/tutorial/index.html")
print(f"标题: {docs[0].title}")
print(f"内容长度: {len(docs[0].content)} 字符")
统一加载入口
"""
统一文档加载入口
自动根据文件类型选择合适的加载器
"""
class UniversalLoader:
    """Dispatch to a concrete loader based on file extension."""

    # Extension -> zero-argument loader factory. Classes and lambdas are
    # both callables, so one uniform call site suffices.
    # NOTE(review): TextLoader / HTMLFileLoader are not defined in this
    # file — confirm they are defined or imported before these entries
    # are used, otherwise loading .txt/.html raises NameError.
    LOADERS = {
        ".md": MarkdownLoader,
        ".markdown": MarkdownLoader,
        ".pdf": PDFLoader,
        ".txt": lambda: TextLoader(),
        ".html": lambda: HTMLFileLoader(),
        ".htm": lambda: HTMLFileLoader(),
    }

    def load(self, path: str) -> list[Document]:
        """Load *path* with the loader registered for its extension.

        Raises:
            ValueError: if the extension is not in ``LOADERS``.
        """
        ext = Path(path).suffix.lower()
        if ext not in self.LOADERS:
            raise ValueError(f"不支持的文件格式: {ext}")
        # Fix: the previous callable/isinstance check had two byte-identical
        # branches; every registered value is already a zero-arg factory.
        loader = self.LOADERS[ext]()
        return loader.load(path)

    def load_directory(self, directory: str) -> list[Document]:
        """Load every supported file under *directory*, recursively.

        Failures are reported and skipped so one bad file does not abort
        the batch; returns all loaded document fragments.
        """
        docs: list[Document] = []
        supported_exts = set(self.LOADERS.keys())
        for file_path in Path(directory).rglob("*"):
            if file_path.is_file() and file_path.suffix.lower() in supported_exts:
                try:
                    loaded = self.load(str(file_path))
                    docs.extend(loaded)
                    print(f" ✓ {file_path.name} ({len(loaded)} 段)")
                except Exception as e:
                    print(f" ✗ {file_path.name}: {e}")
        print(f"\n共加载 {len(docs)} 个文档片段")
        return docs
# Usage example: bulk-load every supported document under a directory.
loader = UniversalLoader()
all_docs = loader.load_directory("./knowledge_base/")
文档清洗
加载后的文档通常需要清洗,去除噪音:
"""
文档清洗工具
"""
import re
class DocumentCleaner:
    """Normalize loaded document text: whitespace, noise, punctuation."""

    def clean(self, doc: "Document") -> "Document":
        """Run all cleaning passes and return *doc* (mutated in place)."""
        content = doc.content
        # 1. Collapse runs of spaces/tabs and excess blank lines.
        content = re.sub(r"[ \t]+", " ", content)
        content = re.sub(r"\n{3,}", "\n\n", content)
        # 2. Strip non-printable control characters (keeps \t, \n, \r).
        content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", content)
        # 3. Drop standalone page-number lines.
        content = re.sub(r"\n\s*\d+\s*\n", "\n", content)
        # 4. Drop very short lines (usually extraction noise), but keep
        #    blank lines so paragraph structure survives.
        lines = content.split("\n")
        lines = [line for line in lines if len(line.strip()) > 2 or line.strip() == ""]
        content = "\n".join(lines)
        # 5. Normalize curly quotes/apostrophes to ASCII.
        #    Fix: the original literals were mangled in transit (the
        #    apostrophe line was a syntax error); explicit escapes make
        #    the mapping unambiguous.
        content = content.replace("\u201c", '"').replace("\u201d", '"')
        content = content.replace("\u2018", "'").replace("\u2019", "'")
        doc.content = content.strip()
        return doc

    def remove_boilerplate(self, doc: "Document") -> "Document":
        """Drop common header/footer boilerplate lines (in place)."""
        lines = doc.content.split("\n")
        # Prefixes matched case-insensitively against each stripped line.
        patterns = [
            r"^confidential",
            r"^page \d+ of \d+",
            r"^all rights reserved",
            r"^copyright",
        ]
        filtered = []
        for line in lines:
            if not any(re.match(p, line.strip().lower()) for p in patterns):
                filtered.append(line)
        doc.content = "\n".join(filtered)
        return doc
处理流程总结
graph LR
A[原始文档] --> B[格式识别]
B --> C[选择加载器]
C --> D[文本提取]
D --> E[元数据提取]
E --> F[文档清洗]
F --> G[质量检查]
G -->|通过| H[标准化文档]
G -->|不通过| I[人工审核]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style H fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
本章小结
- 文档加载是 RAG 的第一步,决定了后续数据的质量
- 不同格式(Markdown、PDF、HTML)需要不同的加载策略
- 统一加载入口可以简化多格式文档处理
- 文档清洗是不可跳过的质量保障步骤
- PDF 扫描件需要 OCR 支持
下一章:我们将学习文本切分策略,这是影响 RAG 质量的关键步骤。