文档智能处理
High Contrast
Dark Mode
Light Mode
Sepia
Forest
1 min read · 135 words

文档智能处理

PDF、扫描件、复杂表格——文档是企业场景中最常见的多模态数据。

文档处理流程

graph TB
    A[文档输入] --> B{文档类型?}
    B -->|原生 PDF| C[文本提取]
    B -->|扫描件/图片| D[OCR 识别]
    B -->|复杂版面| E[VLM 理解]
    C --> F[结构化解析]
    D --> F
    E --> F
    F --> G[表格提取]
    F --> H[图表解析]
    F --> I[文本分段]
    G --> J[结构化数据]
    H --> J
    I --> J
    J --> K[知识库/RAG]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    style K fill:#c8e6c9,stroke:#388e3c,stroke-width:2px

文档解析方案

"""
文档解析方案对比与实践
"""
from dataclasses import dataclass
@dataclass
class ParseResult:
    """Bundle of the pieces produced by parsing a single document.

    Field order is part of the constructor signature and must not change.
    """

    # Extracted text content.
    text: str
    # Extracted tables, one dict per table (dict schema is not defined here).
    tables: list[dict]
    # Extracted images as strings -- presumably paths or encoded data;
    # TODO confirm against the producer of this object.
    images: list[str]
    # Free-form document metadata.
    metadata: dict
    # Page count of the document.
    pages: int
class DocumentParser:
    """Catalogue of document-parsing solutions plus a scenario recommender.

    ``SOLUTIONS`` maps a solution name to a dict of descriptive fields
    ("类型", "能力", "优势", "劣势", "成本", "适用").  Only the two open-source
    entries carry a "代码" usage sample; the API entries have no such key, so
    read it with ``.get("代码")``.
    """

    SOLUTIONS = {
        "PyMuPDF (fitz)": {
            "类型": "开源库",
            "能力": "文本提取、图片提取、表格",
            "优势": "快速、轻量、无需 GPU",
            "劣势": "复杂版面效果差",
            "成本": "免费",
            "适用": "原生 PDF 文本提取",
            # "markdown" is not one of the documented Page.get_text() formats,
            # so the sample extracts plain text and points to pymupdf4llm for
            # Markdown output.  Loop bodies are indented so the sample is
            # valid Python when copied out.
            "代码": (
                "\n"
                "import fitz  # pip install PyMuPDF\n"
                'doc = fitz.open("document.pdf")\n'
                "for page in doc:\n"
                '    text = page.get_text("text")\n'
                "    print(text)\n"
                "    # 提取表格\n"
                "    tables = page.find_tables()\n"
                "    for table in tables.tables:\n"
                "        df = table.to_pandas()\n"
                "        print(df)\n"
                "# 如需 Markdown 输出,请使用 pymupdf4llm (pip install pymupdf4llm)\n"
                "# import pymupdf4llm\n"
                '# md = pymupdf4llm.to_markdown("document.pdf")\n'
            ),
        },
        "Marker": {
            "类型": "开源工具",
            "能力": "PDF → Markdown,保持格式",
            "优势": "版面理解好、表格处理强",
            "劣势": "需要 GPU 加速",
            "成本": "免费",
            "适用": "学术论文、技术文档",
            "代码": (
                "\n"
                "# pip install marker-pdf\n"
                "# marker_single document.pdf output_dir\n"
                "from marker.converters.pdf import PdfConverter\n"
                "from marker.models import create_model_dict\n"
                "models = create_model_dict()\n"
                "converter = PdfConverter(artifact_dict=models)\n"
                'rendered = converter("document.pdf")\n'
                "# rendered.markdown 包含完整 Markdown\n"
            ),
        },
        "Claude PDF": {
            "类型": "API",
            "能力": "原生 PDF 理解、多页分析",
            "优势": "最强理解能力、长文档支持",
            "劣势": "成本较高",
            "成本": "约 $0.01-0.05/页",
            "适用": "需要理解内容的场景",
        },
        "GPT-4o Vision": {
            "类型": "API",
            "能力": "图片形式分析文档",
            "优势": "理解复杂版面",
            "劣势": "需转图片、成本高",
            "成本": "约 $0.01-0.03/页",
            "适用": "扫描件、复杂版面",
        },
    }

    # Solution returned for scenarios that have no explicit mapping.
    _DEFAULT_SOLUTION = "PyMuPDF (fitz)"

    # Scenario label -> key of SOLUTIONS.  Kept at class level so the table
    # is built once instead of on every recommend() call.
    _RECOMMENDATIONS = {
        "简单PDF": "PyMuPDF (fitz)",
        "学术论文": "Marker",
        "扫描件": "GPT-4o Vision",
        "合同分析": "Claude PDF",
        "批量处理": "Marker",
        "问答交互": "Claude PDF",
    }

    @classmethod
    def recommend(cls, scenario: str) -> str:
        """Return the name of the solution recommended for *scenario*.

        Args:
            scenario: One of the known scenario labels (e.g. "学术论文").

        Returns:
            A key of ``SOLUTIONS``.  Unknown scenarios fall back to
            "PyMuPDF (fitz)".
        """
        return cls._RECOMMENDATIONS.get(scenario, cls._DEFAULT_SOLUTION)
# Print a comparison of every parsing solution in the catalogue.
parser = DocumentParser()
print("=== 文档解析方案 ===")
for name, info in parser.SOLUTIONS.items():
    # Assemble the four report lines first, then emit them in one call.
    summary = [
        f"\n{name} ({info['类型']}):",
        f"  优势: {info['优势']}",
        f"  成本: {info['成本']}",
        f"  适用: {info['适用']}",
    ]
    print("\n".join(summary))

表格提取与理解

"""
表格提取与结构化
"""
class TableExtractor:
    """Prompt text, sample code and a strategy table for table extraction.

    Both methods only return strings; nothing here opens a document or
    calls a model.
    """

    @staticmethod
    def with_vlm_prompt() -> str:
        """Return the prompt used to ask a vision-language model for a table.

        The prompt requests Markdown output and, for multi-level headers,
        a JSON object with "headers", "rows" and "notes".
        """
        return (
            "\n"
            "请提取图片中表格的所有内容。\n"
            "输出格式要求:\n"
            "1. 使用 Markdown 表格格式\n"
            "2. 保留合并单元格的逻辑关系\n"
            "3. 数值保持原始精度\n"
            "4. 表头明确标注\n"
            "如果表格有多级表头,请用 JSON 格式:\n"
            "{\n"
            '"headers": ["列1", "列2"],\n'
            '"rows": [\n'
            '["值1", "值2"],\n'
            "...\n"
            "],\n"
            '"notes": "表格下方的备注"\n'
            "}\n"
        )

    @staticmethod
    def with_pymupdf() -> str:
        """Return a PyMuPDF code sample that extracts tables from page 0.

        The loop body is indented so the returned sample is syntactically
        valid Python when copied out and run.
        """
        return (
            "\n"
            "import fitz\n"
            'doc = fitz.open("document.pdf")\n'
            "page = doc[0]\n"
            "# 查找表格\n"
            "tables = page.find_tables()\n"
            'print(f"找到 {len(tables.tables)} 个表格")\n'
            "for i, table in enumerate(tables.tables):\n"
            # The doubled backslash keeps a literal "\n" escape inside the
            # sample's own f-string rather than a real newline.
            '    print(f"\\n表格 {i+1}:")\n'
            "    # 转为 Pandas DataFrame\n"
            "    df = table.to_pandas()\n"
            "    print(df.to_markdown())\n"
        )

    # Table kind -> recommended extraction method and the situation it fits.
    STRATEGIES = {
        "简单表格": {
            "方法": "PyMuPDF find_tables()",
            "场景": "清晰边框、简单结构",
        },
        "合并单元格": {
            "方法": "VLM + 结构化 Prompt",
            "场景": "复杂表头、合并单元格",
        },
        "无边框表格": {
            "方法": "VLM (高精度模式)",
            "场景": "对齐式表格、无边框",
        },
        "嵌套表格": {
            "方法": "分区域 VLM 分析",
            "场景": "表格中嵌套子表",
        },
    }
# List each table kind with its recommended method and typical scenario.
print("=== 表格提取策略 ===")
for name, info in TableExtractor.STRATEGIES.items():
    method, scenario = info["方法"], info["场景"]
    print(f"  {name}: {method} → {scenario}")

多页文档处理

"""
长文档分页处理
"""
class LongDocumentProcessor:
    """Choose a batching strategy and estimate model cost for long documents."""

    # Documents with more pages than this use the hierarchical strategy.
    HIERARCHICAL_THRESHOLD = 50
    # Extra passes of the hierarchical strategy on top of the page batches;
    # per its description these are the table-of-contents and summary steps.
    HIERARCHICAL_EXTRA_PASSES = 2

    # Rough per-page token budget.  The original note gives ~800 tokens for
    # text pages and ~1500 for image pages; 1000 is the single figure used.
    TOKENS_PER_PAGE = 1000
    OUTPUT_TOKENS_PER_PAGE = 200

    # Price in USD per one million tokens, by model.
    PRICING = {
        "claude-3.5-sonnet": {"input": 3.0, "output": 15.0},
        "gpt-4o": {"input": 2.5, "output": 10.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    }
    # Price table used when the requested model is not in PRICING.
    FALLBACK_MODEL = "gpt-4o"

    def __init__(self, max_pages_per_batch: int = 5):
        """Create a processor.

        Args:
            max_pages_per_batch: Maximum number of pages sent in one batch.

        Raises:
            ValueError: If ``max_pages_per_batch`` is smaller than 1; such a
                value would otherwise surface later as a ZeroDivisionError
                or a meaningless batch count.
        """
        if max_pages_per_batch < 1:
            raise ValueError("max_pages_per_batch must be at least 1")
        self.max_pages = max_pages_per_batch

    def _page_batches(self, total_pages: int) -> int:
        """Return how many batches of ``self.max_pages`` cover *total_pages*."""
        # Ceiling division: a trailing partial batch still counts as a batch.
        return (total_pages + self.max_pages - 1) // self.max_pages

    def process_strategy(self, total_pages: int) -> dict:
        """Pick a processing strategy for a document of *total_pages* pages.

        Args:
            total_pages: Number of pages in the document.

        Returns:
            A dict with "strategy" (``single_batch``, ``batch_processing`` or
            ``hierarchical``), a human-readable "description" and the number
            of "batches".
        """
        # Compare against the configured batch size rather than a fixed 5,
        # so a document is only sent whole when it actually fits one batch.
        if total_pages <= self.max_pages:
            return {
                "strategy": "single_batch",
                "description": "直接发送所有页面",
                "batches": 1,
            }

        batches = self._page_batches(total_pages)
        if total_pages <= self.HIERARCHICAL_THRESHOLD:
            return {
                "strategy": "batch_processing",
                "description": f"分 {batches} 批处理",
                "batches": batches,
            }

        return {
            "strategy": "hierarchical",
            "description": "先提取目录 → 按章节处理 → 汇总",
            "batches": batches + self.HIERARCHICAL_EXTRA_PASSES,
        }

    def estimate_cost(
        self, total_pages: int, model: str = "claude-3.5-sonnet"
    ) -> dict:
        """Estimate the cost of processing *total_pages* pages with *model*.

        Args:
            total_pages: Number of pages to process.
            model: Model name.  Names missing from ``PRICING`` are priced
                with the "gpt-4o" table, while the returned "model" field
                still echoes the name that was passed in.

        Returns:
            A dict with "pages", "model", "input_tokens", "output_tokens"
            and "cost" (a string such as ``"$0.6000"``).
        """
        price = self.PRICING.get(model, self.PRICING[self.FALLBACK_MODEL])
        total_input = total_pages * self.TOKENS_PER_PAGE
        total_output = total_pages * self.OUTPUT_TOKENS_PER_PAGE
        cost = (
            total_input / 1_000_000 * price["input"]
            + total_output / 1_000_000 * price["output"]
        )
        return {
            "pages": total_pages,
            "model": model,
            "input_tokens": total_input,
            "output_tokens": total_output,
            "cost": f"${cost:.4f}",
        }
# Usage: show the chosen strategy and estimated cost for three document sizes.
processor = LongDocumentProcessor()
for pages in (3, 20, 100):
    strategy = processor.process_strategy(pages)
    cost = processor.estimate_cost(pages)
    report = (
        f"\n{pages} 页文档:",
        f"  策略: {strategy['description']}",
        f"  成本: {cost['cost']}",
    )
    # One print call with a newline separator emits the same three lines.
    print(*report, sep="\n")

智能文档助手

"""
智能文档助手:上传文档 → 理解 → 问答
"""
class DocumentAssistant:
    """In-memory document store that builds question and summary prompts.

    No model is called here: ``query`` and ``summarize`` return the prompt
    text together with the model name intended to receive it.
    """

    # Text placed between consecutive pages when they are joined.
    PAGE_SEPARATOR = "\n---\n"
    # Maximum number of document characters embedded in each prompt kind.
    QUERY_CONTEXT_CHARS = 5000
    SUMMARY_CONTEXT_CHARS = 3000
    MODEL = "gpt-4o-mini"

    def __init__(self):
        # doc_id -> {"pages": list[str], "page_count": int, "total_chars": int}
        self.documents: dict[str, dict] = {}

    def ingest(self, doc_id: str, pages_text: list[str]) -> dict:
        """Store a document under *doc_id*, replacing any previous entry.

        Args:
            doc_id: Identifier used later by ``query`` and ``summarize``.
            pages_text: Text of each page, in order.

        Returns:
            A dict with "doc_id", "pages" (page count) and "chars" (total
            characters across all pages).
        """
        # Copy so that later mutation of the caller's list cannot make the
        # stored pages disagree with the stored counts.
        pages = list(pages_text)
        total_chars = sum(len(page) for page in pages)
        self.documents[doc_id] = {
            "pages": pages,
            "page_count": len(pages),
            "total_chars": total_chars,
        }
        return {
            "doc_id": doc_id,
            "pages": len(pages),
            "chars": total_chars,
        }

    def query(self, doc_id: str, question: str) -> dict:
        """Build a question-answering prompt over a stored document.

        Args:
            doc_id: Identifier passed to ``ingest``.
            question: The user's question.

        Returns:
            ``{"error": ...}`` when *doc_id* is unknown; otherwise a dict
            with "prompt", "model" and "context_length".  "context_length"
            is the length of the full joined text, while the prompt embeds
            at most the first 5000 characters of it.
        """
        doc = self.documents.get(doc_id)
        if doc is None:
            return {"error": "文档不存在"}

        context = self.PAGE_SEPARATOR.join(doc["pages"])
        prompt = (
            "基于以下文档内容回答问题。\n"
            "文档内容:\n"
            f"{context[:self.QUERY_CONTEXT_CHARS]}\n"
            f"问题:{question}\n"
            "请基于文档内容准确回答,如果文档中没有相关信息,请明确说明。"
        )
        return {
            "prompt": prompt,
            "model": self.MODEL,
            "context_length": len(context),
        }

    def summarize(self, doc_id: str) -> dict:
        """Build a structured-summary prompt for a stored document.

        The excerpt is taken from all pages joined together (not only the
        first page), which also makes a document with no pages safe to
        summarize.  A trailing "..." is added only when text was cut off.

        Args:
            doc_id: Identifier passed to ``ingest``.

        Returns:
            ``{"error": ...}`` when *doc_id* is unknown; otherwise a dict
            with "prompt" and "model".
        """
        doc = self.documents.get(doc_id)
        if doc is None:
            return {"error": "文档不存在"}

        context = self.PAGE_SEPARATOR.join(doc["pages"])
        excerpt = context[:self.SUMMARY_CONTEXT_CHARS]
        if len(context) > self.SUMMARY_CONTEXT_CHARS:
            excerpt += "..."
        return {
            "prompt": (
                "请对以下文档进行结构化摘要:\n"
                "1. 文档主题\n"
                "2. 关键要点(bullet points)\n"
                "3. 核心数据/结论\n"
                "4. 一段话概述\n\n"
                f"文档内容:\n{excerpt}"
            ),
            "model": self.MODEL,
        }
# Demo: ingest a two-page report, then build a query prompt for it.
assistant = DocumentAssistant()

# Simulated document content, one string per page.
demo_doc_id = "report-2024"
demo_pages = [
    "第一季度销售报告\n销售额:1000万\n同比增长:15%",
    "第二季度展望\n预计销售额:1200万\n重点产品:AI 解决方案",
]
result = assistant.ingest(demo_doc_id, demo_pages)
print(f"导入文档: {result}")

# Ask a question about the ingested report.
query = assistant.query(demo_doc_id, "销售额是多少?")
model_name, context_chars = query["model"], query["context_length"]
print(f"查询 → model={model_name}, context={context_chars} chars")

本章小结

| 工具 | 场景 | 成本 | 质量 |
| --- | --- | --- | --- |
| PyMuPDF | 原生 PDF 文本 | 免费 | ⭐⭐⭐ |
| Marker | PDF → Markdown | 免费 | ⭐⭐⭐⭐ |
| Claude PDF | 文档理解问答 | $0.01-0.05/页 | ⭐⭐⭐⭐⭐ |
| GPT-4o Vision | 扫描件/复杂版面 | $0.01-0.03/页 | ⭐⭐⭐⭐ |

下一章:多模态 RAG 与跨模态检索。