PDF 与发票自动化处理
财务部门每月处理大量发票和 PDF 报表是最常见的自动化需求之一。
发票处理流程
graph TB
A[发票来源] --> B{格式}
B -->|PDF| C[PDF 解析]
B -->|图片| D[OCR 识别]
B -->|电子发票| E[XML 解析]
C --> F[字段抽取]
D --> F
E --> F
F --> G[数据校验]
G --> H{合格?}
H -->|是| I[写入系统]
H -->|否| J[人工复核]
I --> K[自动记账]
style A fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style I fill:#c8e6c9,stroke:#43a047,stroke-width:2px
style J fill:#ffcdd2,stroke:#c62828,stroke-width:2px
发票数据抽取
from dataclasses import dataclass
from datetime import date
import re
@dataclass
class InvoiceData:
    """Structured fields extracted from a single invoice.

    ``date`` is expected to be an ISO ``YYYY-MM-DD`` string and
    ``confidence`` a ratio in the range 0..1; ``validate`` reports
    violations of those expectations instead of raising.
    """

    invoice_number: str
    vendor: str
    amount: float
    tax: float
    date: str
    items: list[dict]
    confidence: float = 0.0

    @property
    def total(self) -> float:
        """Return the amount plus tax."""
        return self.amount + self.tax

    def validate(self) -> list[str]:
        """Check the extracted fields and return human-readable problems.

        Returns:
            A list of error messages; an empty list means the invoice
            passed every check.
        """
        # Local alias: the dataclass field above is also called ``date``,
        # so importing under a distinct name keeps the intent unambiguous.
        from datetime import date as _date

        errors = []
        if not self.invoice_number:
            errors.append("发票号缺失")
        if self.amount <= 0:
            errors.append("金额异常")
        if self.confidence < 0.8:
            errors.append(f"识别置信度过低: {self.confidence:.0%}")
        if not self._is_iso_date(self.date, _date):
            errors.append("日期格式异常")
        return errors

    @staticmethod
    def _is_iso_date(value: str, date_cls) -> bool:
        """Return True when *value* is exactly ``YYYY-MM-DD`` and a real date."""
        # fullmatch (not match) so trailing garbage such as "2024-03-15abc"
        # is rejected rather than silently accepted.
        if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", value):
            return False
        try:
            # Rejects impossible calendar dates such as 2024-02-30.
            date_cls.fromisoformat(value)
        except ValueError:
            return False
        return True
class InvoiceExtractor:
    """Regex-based extractor that turns invoice text into ``InvoiceData``.

    ``PATTERNS`` maps a field name to a regular expression whose first
    capture group holds the field value.
    """

    PATTERNS = {
        "invoice_number": r"发票号[码]?\s*[::]\s*(\d{8,20})",
        "amount": r"(?:金额|合计)\s*[::]\s*[¥¥]?\s*([\d,.]+)",
        "tax": r"(?:税额|税金)\s*[::]\s*[¥¥]?\s*([\d,.]+)",
        "date": r"(\d{4}[-/年]\d{1,2}[-/月]\d{1,2})",
        "vendor": r"(?:销售方|供应商)\s*[::]\s*(.+?)(?:\n|$)",
    }

    def extract(self, text: str) -> InvoiceData:
        """Extract invoice fields from *text*.

        Fields that are not found are left empty (or ``0.0`` for amounts).
        ``confidence`` is the fraction of patterns that produced a usable
        value.

        Args:
            text: Plain text of one invoice.

        Returns:
            An ``InvoiceData`` with an empty ``items`` list.
        """
        fields = {}
        for field_name, pattern in self.PATTERNS.items():
            match = re.search(pattern, text)
            fields[field_name] = match.group(1).strip() if match else ""

        amounts = {}
        for key in ("amount", "tax"):
            value = self._parse_amount(fields.get(key, ""))
            if value is None:
                # The regex matched something that is not a number
                # (e.g. "1.2.3"): treat it as unrecognized so it lowers
                # the confidence score instead of crashing the caller.
                fields[key] = ""
                value = 0.0
            amounts[key] = value

        # Guard against an overridden, empty PATTERNS table.
        recognized = sum(1 for v in fields.values() if v)
        confidence = recognized / len(fields) if fields else 0.0

        return InvoiceData(
            invoice_number=fields.get("invoice_number", ""),
            vendor=fields.get("vendor", ""),
            amount=amounts["amount"],
            tax=amounts["tax"],
            date=self._normalize_date(fields.get("date", "")),
            items=[],
            confidence=confidence,
        )

    @staticmethod
    def _parse_amount(raw: str) -> float | None:
        """Parse a captured amount; 0.0 if empty, None if not a number."""
        if not raw:
            return 0.0
        # The greedy [\d,.]+ capture can swallow a sentence-ending period
        # ("10,000.00."), so drop trailing dots after removing the
        # thousands separators.
        cleaned = raw.replace(",", "").rstrip(".")
        try:
            return float(cleaned)
        except ValueError:
            return None

    @staticmethod
    def _normalize_date(raw: str) -> str:
        """Rewrite dates like ``2024/3/15`` or ``2024年3月15`` as ``2024-03-15``.

        The date pattern accepts several separators and one-digit
        month/day, while ``InvoiceData.validate`` only accepts
        ``YYYY-MM-DD``; normalizing here keeps the two in agreement.
        Anything that does not split into three numeric parts is
        returned unchanged.
        """
        parts = re.split(r"[-/年月]", raw)
        if len(parts) != 3 or not all(p.isdigit() for p in parts):
            return raw
        year, month, day = parts
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
# Usage example: run the extractor on a sample invoice and print the result.
sample_text = """
增值税普通发票
发票号码:12345678901234567890
开票日期:2024-03-15
销售方:ABC科技有限公司
金额:¥10,000.00
税额:¥600.00
"""

extractor = InvoiceExtractor()
invoice = extractor.extract(sample_text)

# An empty error list means every validation check passed.
errors = invoice.validate()

print(f"发票: {invoice.invoice_number}, 合计: ¥{invoice.total:,.2f}")
print(f"校验: {'通过' if not errors else errors}")
PDF 批量处理管道
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class ProcessingResult:
    """Outcome of processing one input document."""

    # Name of the source document this result belongs to.
    filename: str
    # Whether the document was processed without validation errors.
    success: bool
    # Extracted invoice, or None when no invoice is attached.
    invoice: InvoiceData | None = None
    # Validation messages; a fresh list per instance.
    errors: list[str] = field(default_factory=list)
class PDFPipeline:
    """Batch pipeline that extracts and validates invoices from PDF text.

    Every processed document is appended to ``self.results``; the list
    keeps growing across successive ``process_batch`` calls.
    """

    def __init__(self, extractor: InvoiceExtractor):
        """Store the extractor and start with an empty result list."""
        self.extractor = extractor
        self.results: list[ProcessingResult] = []

    def process_batch(self, pdf_texts: dict[str, str]) -> dict:
        """Process the text of several PDFs and summarize the batch.

        Args:
            pdf_texts: Mapping of file name to the text of that PDF.

        Returns:
            A summary dict with the total count, success and failure
            counts, the success rate and the summed total of the
            successfully validated invoices (the last two as formatted
            strings).
        """
        success_count = 0
        fail_count = 0
        total_amount = 0.0

        for filename, text in pdf_texts.items():
            try:
                invoice = self.extractor.extract(text)
            except ValueError as exc:
                # One unparseable document must not abort the whole batch:
                # record it as a failure (no invoice attached) and move on.
                self.results.append(
                    ProcessingResult(
                        filename=filename,
                        success=False,
                        errors=[f"抽取失败: {exc}"],
                    )
                )
                fail_count += 1
                continue

            errors = invoice.validate()
            result = ProcessingResult(
                filename=filename,
                success=not errors,
                invoice=invoice,
                errors=errors,
            )
            self.results.append(result)

            if result.success:
                success_count += 1
                total_amount += invoice.total
            else:
                fail_count += 1

        return {
            "总数": len(pdf_texts),
            "成功": success_count,
            "失败(需人工)": fail_count,
            "成功率": f"{success_count/len(pdf_texts):.0%}" if pdf_texts else "0%",
            "总金额": f"¥{total_amount:,.2f}",
        }

    def export_to_csv_rows(self) -> list[dict]:
        """Return one dict per successful result, ready for CSV writing.

        Failed results and results without an invoice are skipped.
        """
        return [
            {
                "文件": r.filename,
                "发票号": r.invoice.invoice_number,
                "供应商": r.invoice.vendor,
                "金额": r.invoice.amount,
                "税额": r.invoice.tax,
                "合计": r.invoice.total,
                "日期": r.invoice.date,
            }
            for r in self.results
            if r.success and r.invoice
        ]
文档自动化场景对比
| 场景 | 输入格式 | 工具链 | 准确率 | 处理速度 |
|---|---|---|---|---|
| 增值税发票 | PDF/图片 | OCR + 正则 | 95%+ | 3秒/张 |
| 合同关键条款 | | NLP 抽取 | 85% | 10秒/份 |
| 银行对账单 | | 表格解析 | 90% | 5秒/页 |
| 报销单据 | 图片 | OCR + 分类 | 80% | 5秒/张 |
| 电子发票 | XML/OFD | 直接解析 | 99% | <1秒 |
本章小结
- 正则抽取适合结构化发票——格式固定时准确率高、速度快
- 置信度评分——字段识别率低于 80% 自动转人工
- 批量管道——统一处理 + 统计 + 导出 CSV
- 电子发票优先——XML/OFD 直接解析,无需 OCR
- 人工复核兜底——自动化 + 人工复核双保险
下一章:AI 知识库与智能邮件