微调工具链与自动化
手动微调一次容易,持续迭代需要自动化流水线。本章构建从数据到部署的端到端管道。
微调 MLOps 全景
graph TB
A[数据管理] --> B[训练管道]
B --> C[评估管道]
C --> D[模型仓库]
D --> E[部署管道]
E --> F[监控反馈]
F --> A
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style F fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
工具链对比
| 工具 | 定位 | 支持方法 | 易用性 | 生产就绪 |
|---|---|---|---|---|
| Axolotl | 全功能微调 | SFT/DPO/RLHF | ★★★★☆ | ★★★★★ |
| Unsloth | 高速微调 | SFT/DPO | ★★★★★ | ★★★★☆ |
| LLaMA-Factory | 中文友好 | SFT/DPO/PPO | ★★★★★ | ★★★★☆ |
| TRL | HuggingFace 官方 | SFT/DPO/PPO | ★★★☆☆ | ★★★★☆ |
| OpenRLHF | RLHF 专精 | PPO/DPO | ★★★☆☆ | ★★★☆☆ |
| MLX | Apple Silicon | SFT/LoRA | ★★★★☆ | ★★★☆☆ |
自动化训练配置
"""
微调自动化管道
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
import time
class PipelineStage(Enum):
    """Named stages of the fine-tuning pipeline, listed in execution order."""
    DATA_VALIDATION = "data_validation"
    PREPROCESSING = "preprocessing"
    TRAINING = "training"
    EVALUATION = "evaluation"
    MODEL_REGISTRY = "model_registry"
    DEPLOYMENT = "deployment"
@dataclass
class PipelineConfig:
    """Configuration for one fine-tuning pipeline run."""
    name: str
    base_model: str = "meta-llama/Llama-3.1-8B-Instruct"
    method: str = "lora"  # lora / qlora / full
    # Data
    train_data: str = ""
    eval_data: str = ""
    # Training
    lora_r: int = 16
    learning_rate: float = 2e-4
    num_epochs: int = 3
    batch_size: int = 4
    # Evaluation gates
    min_accuracy: float = 0.85
    max_harmful_rate: float = 0.01
    max_capability_drop: float = 0.02  # general-capability regression must stay < 2%
    # Automatic deployment
    auto_deploy: bool = False
@dataclass
class StageResult:
    """Outcome of a single pipeline stage."""
    stage: PipelineStage
    status: str = "pending"  # pending / running / passed / failed
    duration_s: float = 0.0
    metrics: dict[str, Any] = field(default_factory=dict)
    error: str = ""
class FineTunePipeline:
    """Automated fine-tuning pipeline.

    Runs the configured stages in order (data validation, preprocessing,
    training, evaluation, model registry, and — when ``auto_deploy`` is
    enabled — deployment), records a ``StageResult`` per stage, and stops
    at the first failure.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        # Results keyed by stage; only stages that actually ran appear here.
        self._results: dict[PipelineStage, StageResult] = {}

    def run(self) -> dict:
        """Execute the full pipeline and return the summary dict."""
        stages = [
            (PipelineStage.DATA_VALIDATION, self._validate_data),
            (PipelineStage.PREPROCESSING, self._preprocess),
            (PipelineStage.TRAINING, self._train),
            (PipelineStage.EVALUATION, self._evaluate),
            (PipelineStage.MODEL_REGISTRY, self._register),
        ]
        if self.config.auto_deploy:
            stages.append((PipelineStage.DEPLOYMENT, self._deploy))
        for stage, handler in stages:
            result = self._run_stage(stage, handler)
            if result.status == "failed":
                # Fail fast: later stages must not run on a failed artifact.
                break
        return self.get_summary()

    def _run_stage(self, stage: PipelineStage, handler) -> StageResult:
        """Run a single stage, capturing metrics, duration and any error."""
        result = StageResult(stage=stage, status="running")
        start = time.time()
        try:
            result.metrics = handler()
        except Exception as e:
            result.status = "failed"
            result.error = str(e)
        else:
            result.status = "passed"
        finally:
            result.duration_s = time.time() - start
        self._results[stage] = result
        return result

    def _validate_data(self) -> dict:
        """Validate the raw dataset (placeholder returning fixed metrics)."""
        return {"samples": 1000, "valid_ratio": 0.95, "duplicates": 12}

    def _preprocess(self) -> dict:
        """Preprocess and split the dataset (placeholder returning fixed metrics)."""
        return {"train_samples": 950, "eval_samples": 100, "max_length": 2048}

    def _train(self) -> dict:
        """Train the model (placeholder returning fixed metrics)."""
        return {
            "final_loss": 0.82,
            "training_time_min": 45,
            "gpu_used": "A100-80G",
        }

    def _evaluate(self) -> dict:
        """Evaluate the model and enforce the quality gates.

        Raises:
            ValueError: if any gate (accuracy, harmful rate, or
                general-capability regression) is violated.
        """
        metrics = {
            "accuracy": 0.91,
            "harmful_rate": 0.003,
            "capability_drop": 0.01,
            "win_rate": 0.62,
        }
        # Gate checks — any violation aborts the pipeline here.
        if metrics["accuracy"] < self.config.min_accuracy:
            raise ValueError(f"准确率 {metrics['accuracy']} < 门槛 {self.config.min_accuracy}")
        if metrics["harmful_rate"] > self.config.max_harmful_rate:
            raise ValueError(f"有害率 {metrics['harmful_rate']} > 门槛 {self.config.max_harmful_rate}")
        # Fix: config declares max_capability_drop but it was never enforced;
        # gate general-capability regression as well.
        if metrics["capability_drop"] > self.config.max_capability_drop:
            raise ValueError(
                f"能力退化 {metrics['capability_drop']} > 门槛 {self.config.max_capability_drop}"
            )
        return metrics

    def _register(self) -> dict:
        """Register the model in the model hub (placeholder)."""
        return {"model_id": f"{self.config.name}-v1", "registry": "model-hub"}

    def _deploy(self) -> dict:
        """Deploy the registered model (placeholder)."""
        return {"endpoint": f"https://api.example.com/{self.config.name}", "status": "live"}

    def get_summary(self) -> dict:
        """Summarize all executed stages plus an overall pass/fail verdict."""
        # Fix: an empty result set previously reported "passed" because
        # all() over an empty iterable is vacuously True.
        all_passed = bool(self._results) and all(
            r.status == "passed" for r in self._results.values()
        )
        return {
            "pipeline": self.config.name,
            "stages": {
                stage.value: {
                    "status": result.status,
                    "duration_s": round(result.duration_s, 1),
                    "metrics": result.metrics,
                }
                for stage, result in self._results.items()
            },
            "overall": "passed" if all_passed else "failed",
        }
CI/CD 集成
graph LR
A[Git Push
数据/配置变更] --> B[CI 触发
数据验证]
B --> C[训练
GPU Runner]
C --> D[评估
自动门控]
D --> E{通过?}
E -->|是| F[注册模型
Model Registry]
E -->|否| G[通知 + 回滚]
F --> H[Canary 部署
5% 流量]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style H fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
本章小结
| 要点 | 说明 |
|---|---|
| 推荐工具 | Axolotl(全功能)、Unsloth(快速)、LLaMA-Factory(中文) |
| 自动化管道 | 数据→训练→评估→注册→部署,全程门控 |
| 评估门控 | 准确率、有害率、通用能力退化,任一不达标即阻断 |
| CI/CD | Git push 触发,Canary 部署,支持自动回滚 |
延伸阅读:LLM 评估与测试指南 · DevOps 实战指南