数据质量评估与清洗
微调数据质量 > 数量。100 条高质量数据的效果可能胜过 10000 条噪声数据。
数据质量维度
graph TB
    A[数据质量] --> B[正确性]
    A --> C[一致性]
    A --> D[多样性]
    A --> E[完整性]
    B --> B1[答案准确<br/>无事实错误]
    C --> C1[格式统一<br/>风格一致]
    D --> D1[场景覆盖<br/>避免偏见]
    E --> E1[字段完整<br/>无缺失值]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
质量评估框架
"""
微调数据质量评估器
"""
import re
from dataclasses import dataclass, field
from typing import Any
@dataclass
class QualityReport:
    """Aggregate result of evaluating a dataset with ``DataQualityChecker``."""

    # Number of samples that were passed to the evaluation.
    total_samples: int = 0
    # Number of samples for which no issue at all was recorded.
    valid_samples: int = 0
    # Maps an issue name (e.g. "output_too_short") to how many times it occurred.
    issues: dict[str, int] = field(default_factory=dict)
    # valid_samples / total_samples; stays 0 when the dataset is empty.
    score: float = 0.0
@dataclass
class TrainingSample:
    """A single fine-tuning training sample."""

    # The task instruction; the only required field.
    instruction: str
    # Optional additional input accompanying the instruction (empty by default).
    input_text: str = ""
    # The expected model output for this instruction.
    output: str = ""
    # Free-form extra data; the checker does not inspect it and the cleaner
    # carries it over to the cleaned sample.
    metadata: dict = field(default_factory=dict)
class DataQualityChecker:
    """Rule-based quality checker for fine-tuning samples.

    Each sample is checked for instruction/output length, an output that
    merely repeats the instruction, repetitive output, and leftover
    placeholder markers. ``evaluate_dataset`` additionally flags duplicated
    prompts and aggregates everything into a ``QualityReport``.

    Annotations that mention ``TrainingSample`` / ``QualityReport`` are quoted
    so that defining this class does not require those names to exist yet.
    """

    # Placeholder markers, matched case-insensitively as standalone tokens.
    # The lookarounds reject ASCII letters/digits on either side so ordinary
    # words that merely contain a marker (e.g. "Mastodon" contains "todo")
    # are not flagged, while markers glued to CJK text or punctuation are.
    _PLACEHOLDER_RE = re.compile(
        r"(?<![A-Za-z0-9])"
        r"(?:TODO|FIXME|PLACEHOLDER|X{3,}|Lorem\s+ipsum)"
        r"(?![A-Za-z0-9])",
        re.IGNORECASE,
    )

    # Sentence boundaries: CJK/ASCII terminators and newlines, plus an ASCII
    # period only when followed by whitespace or end of text, so decimals
    # such as "3.14" are not split.
    _SENTENCE_SPLIT_RE = re.compile(r"[。!?!?\n]+|\.(?:\s+|$)")

    def __init__(
        self,
        min_instruction_len: int = 10,
        max_instruction_len: int = 2000,
        min_output_len: int = 5,
        max_output_len: int = 4000,
    ):
        """Store the length limits (in characters) used by ``check_sample``."""
        self.min_instruction_len = min_instruction_len
        self.max_instruction_len = max_instruction_len
        self.min_output_len = min_output_len
        self.max_output_len = max_output_len

    def check_sample(self, sample: "TrainingSample") -> list[str]:
        """Check a single sample and return the names of all issues found.

        Args:
            sample: Object exposing ``instruction`` and ``output`` strings.

        Returns:
            A list of issue names; empty when the sample passes every check.
        """
        issues: list[str] = []

        # Measure the stripped text so whitespace-only or whitespace-padded
        # fields cannot satisfy the minimum-length requirement.
        instruction = sample.instruction.strip()
        output = sample.output.strip()

        # Length checks.
        if len(instruction) < self.min_instruction_len:
            issues.append("instruction_too_short")
        if len(instruction) > self.max_instruction_len:
            issues.append("instruction_too_long")
        if len(output) < self.min_output_len:
            issues.append("output_too_short")
        if len(output) > self.max_output_len:
            issues.append("output_too_long")

        # Content checks.
        if instruction == output:
            issues.append("identical_input_output")
        if self._is_repetitive(sample.output):
            issues.append("repetitive_output")
        if self._has_placeholder(sample.output):
            issues.append("contains_placeholder")

        return issues

    def evaluate_dataset(self, samples: "list[TrainingSample]") -> "QualityReport":
        """Evaluate every sample and aggregate the findings into a report.

        A sample is counted as valid only when it has no issue at all. A
        sample is flagged ``duplicate_instruction`` when an earlier sample has
        the same normalised (instruction, input_text) pair; samples that share
        an instruction but carry different inputs are distinct prompts and are
        not treated as duplicates.

        Args:
            samples: The samples to evaluate.

        Returns:
            A ``QualityReport`` with per-issue counts and the valid ratio as
            ``score`` (0.0 for an empty dataset).
        """
        report = QualityReport(total_samples=len(samples))
        seen_prompts: set[tuple[str, str]] = set()

        for sample in samples:
            issues = self.check_sample(sample)

            # Duplicate detection on the normalised prompt.
            key = (
                self._normalize_key(sample.instruction),
                self._normalize_key(sample.input_text),
            )
            if key in seen_prompts:
                issues.append("duplicate_instruction")
            else:
                seen_prompts.add(key)

            if not issues:
                report.valid_samples += 1
                continue
            for issue in issues:
                report.issues[issue] = report.issues.get(issue, 0) + 1

        if report.total_samples:
            report.score = report.valid_samples / report.total_samples
        else:
            report.score = 0.0
        return report

    @staticmethod
    def _normalize_key(text: str) -> str:
        """Collapse whitespace runs and lowercase ``text`` for duplicate lookup."""
        return " ".join(text.split()).lower()

    def _is_repetitive(self, text: str, threshold: float = 0.5) -> bool:
        """Return True when fewer than ``threshold`` of the sentences are unique.

        Empty fragments (e.g. the one after a trailing terminator) are ignored
        both when counting sentences and when computing the ratio. Texts with
        fewer than three non-empty sentences are never considered repetitive.
        """
        fragments = (part.strip() for part in self._SENTENCE_SPLIT_RE.split(text))
        sentences = [part for part in fragments if part]
        if len(sentences) < 3:
            return False
        return len(set(sentences)) / len(sentences) < threshold

    def _has_placeholder(self, text: str) -> bool:
        """Return True when ``text`` contains a standalone placeholder marker."""
        return self._PLACEHOLDER_RE.search(text) is not None
数据清洗管道
"""
数据清洗管道
"""
import re
from dataclasses import dataclass
class DataCleaner:
    """Normalise and de-duplicate fine-tuning samples.

    Annotations that mention ``TrainingSample`` are quoted because this
    snippet does not define or import that class itself.
    """

    _WHITESPACE_RE = re.compile(r"\s+")
    # C0 control characters other than tab, line feed and carriage return.
    _CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

    def clean_sample(self, sample: "TrainingSample") -> "TrainingSample | None":
        """Clean a single sample.

        Args:
            sample: The sample to clean; it is not modified.

        Returns:
            A new cleaned sample, or ``None`` when the instruction or the
            output is empty after cleaning.
        """
        instruction = self._clean_text(sample.instruction)
        output = self._clean_text(sample.output)

        if not instruction or not output:
            return None

        return TrainingSample(
            instruction=instruction,
            input_text=self._clean_text(sample.input_text),
            output=output,
            # Shallow copy so later edits to the cleaned sample's metadata do
            # not leak back into the original sample (and vice versa).
            metadata=dict(sample.metadata or {}),
        )

    def clean_dataset(self, samples: "list[TrainingSample]") -> "list[TrainingSample]":
        """Clean every sample, dropping empty ones and duplicates.

        Two samples are duplicates when their cleaned instruction and cleaned
        input_text both match case-insensitively; the first occurrence wins.
        Samples that share an instruction but carry different inputs are
        distinct prompts and are all kept.

        Args:
            samples: The samples to clean.

        Returns:
            The cleaned samples in their original relative order.
        """
        cleaned = []
        seen_prompts: set[tuple[str, str]] = set()

        for sample in samples:
            result = self.clean_sample(sample)
            if result is None:
                continue

            # Fields are already whitespace-normalised by clean_sample.
            key = (result.instruction.lower(), result.input_text.lower())
            if key in seen_prompts:
                continue

            seen_prompts.add(key)
            cleaned.append(result)

        return cleaned

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace and strip control characters from ``text``.

        NOTE(review): every whitespace run, including newlines, becomes a
        single space, so multi-line outputs (code, lists) are flattened.
        """
        # Collapse whitespace runs first so whitespace-like control characters
        # (vertical tab, form feed, ...) still act as word separators.
        text = self._WHITESPACE_RE.sub(" ", text)
        # Drop the remaining control characters.
        text = self._CONTROL_CHARS_RE.sub("", text)
        # Removing a control character can leave doubled or leading/trailing
        # spaces behind (e.g. "a \x00 b"), so normalise the spacing again.
        return " ".join(text.split())
常见质量问题
| 问题 | 影响 | 解决方案 |
|---|---|---|
| 重复样本 | 过拟合特定模式 | 去重(基于 instruction) |
| 标签噪声 | 学到错误知识 | 交叉验证 + 人工抽检 |
| 长度偏差 | 输出总是偏长/偏短 | 按长度分桶,均匀采样 |
| 指令模糊 | 任务理解不一致 | 明确化 + 添加约束 |
| 格式不一致 | JSON/Markdown 混用 | 统一输出格式模板 |
本章小结
| 要点 | 说明 |
|---|---|
| 质量 > 数量 | 100 条干净数据 > 10000 条噪声数据 |
| 自动化检查 | 长度 / 输出内容重复 / 占位符 / 重复指令检测 |
| 人工抽检 | 每批次抽检 5-10% |
| 清洗管道 | 标准化 → 去重 → 格式统一 → 质量评分 |
下一章:训练环境与超参数