Instruction Fine-Tuning and Alignment Techniques
From SFT to RLHF to DPO: teaching the model not just to answer, but to answer the way humans expect.
Evolution of Alignment Techniques
graph LR
PT[Pre-training] --> SFT[Supervised Fine-Tuning SFT]
SFT --> RLHF[RLHF]
SFT --> DPO[DPO]
PT --> ORPO[ORPO]
RLHF --> RM[Train reward model]
RM --> PPO[PPO reinforcement learning]
DPO --> DIRECT[Direct preference optimization]
PPO --> ALIGNED[Aligned model]
DIRECT --> ALIGNED
ORPO --> ALIGNED
style SFT fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style DPO fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style ALIGNED fill:#fff3e0,stroke:#f57c00,stroke-width:2px
Supervised Fine-Tuning (SFT)
"""
SFT: Supervised Fine-Tuning
第一步:教模型按指令做事
"""
class SFTExplainer:
"""SFT 详解"""
OVERVIEW = {
"目的": "教模型理解指令并给出有用回答",
"数据": "instruction-output 对",
"方法": "标准交叉熵损失训练",
"效果": "从基座模型变成对话模型",
}
# 使用 TRL 的 SFTTrainer
CODE_EXAMPLE = """
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B",
torch_dtype="bfloat16",
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
dataset = load_dataset("json", data_files="sft_data.jsonl", split="train")
# SFT 配置
sft_config = SFTConfig(
output_dir="./sft-output",
num_train_epochs=3,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
learning_rate=2e-5,
max_seq_length=2048,
packing=True, # 数据打包提高效率
dataset_text_field="text",
)
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
args=sft_config,
tokenizer=tokenizer,
)
trainer.train()
"""
sft = SFTExplainer()
print("=== SFT 概述 ===")
for k, v in sft.OVERVIEW.items():
print(f" {k}: {v}")
RLHF: Reinforcement Learning from Human Feedback
"""
RLHF: Reinforcement Learning from Human Feedback
训练奖励模型 → PPO 优化
"""
class RLHFExplainer:
"""RLHF 详解"""
PIPELINE = {
"Step 1 - SFT": "先做监督微调,得到基础对话模型",
"Step 2 - 奖励模型 (RM)": (
"用人类偏好数据训练: "
"给同一问题的两个回答标注好坏 → "
"模型学习评分"
),
"Step 3 - PPO 训练": (
"用奖励模型的评分作为奖励信号,"
"通过 PPO 算法优化 SFT 模型"
),
}
# RM 数据格式
RM_DATA_FORMAT = {
"prompt": "如何学习编程?",
"chosen": "建议从 Python 开始,先掌握基础语法...",
"rejected": "去学就行了。",
}
CODE_EXAMPLE = """
from trl import RewardTrainer, PPOTrainer, PPOConfig
from transformers import AutoModelForSequenceClassification
# Step 1: 训练奖励模型
reward_model = AutoModelForSequenceClassification.from_pretrained(
"meta-llama/Llama-3.1-8B",
num_labels=1,
)
reward_trainer = RewardTrainer(
model=reward_model,
train_dataset=preference_dataset, # chosen vs rejected
# ...
)
reward_trainer.train()
# Step 2: PPO 训练
ppo_config = PPOConfig(
learning_rate=1e-5,
batch_size=16,
mini_batch_size=4,
ppo_epochs=4,
)
ppo_trainer = PPOTrainer(
config=ppo_config,
model=sft_model,
ref_model=sft_model, # 参考模型防止偏离
reward_model=reward_model,
tokenizer=tokenizer,
)
"""
PROS_CONS = {
"优点": [
"效果最好,GPT-4 等顶级模型都用",
"能学到细微的人类偏好",
],
"缺点": [
"流程复杂(三阶段)",
"需要大量偏好数据",
"训练不稳定",
"成本高",
],
}
rlhf = RLHFExplainer()
print("=== RLHF 流程 ===")
for step, desc in rlhf.PIPELINE.items():
print(f" {step}: {desc}")
DPO: Direct Preference Optimization
"""
DPO: Direct Preference Optimization
无需奖励模型,直接从偏好数据优化
"""
class DPOExplainer:
"""DPO 详解"""
OVERVIEW = {
"核心思想": "跳过奖励模型,直接用偏好数据优化策略",
"公式": "最大化 chosen 概率,最小化 rejected 概率",
"优势": "比 RLHF 简单稳定,效果接近",
}
# DPO 数据格式:同 RM 一样
DATA_FORMAT = {
"prompt": "解释什么是机器学习",
"chosen": (
"机器学习是人工智能的一个分支,"
"它使计算机能够从数据中学习规律,"
"而无需被明确地编程..."
),
"rejected": "就是让机器学习。",
}
CODE_EXAMPLE = """
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
# 加载 SFT 后的模型
model = AutoModelForCausalLM.from_pretrained("sft-output")
ref_model = AutoModelForCausalLM.from_pretrained("sft-output")
tokenizer = AutoTokenizer.from_pretrained("sft-output")
# DPO 配置
dpo_config = DPOConfig(
output_dir="./dpo-output",
num_train_epochs=1,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
learning_rate=5e-7, # DPO 用非常小的学习率
beta=0.1, # KL 散度系数
max_length=2048,
max_prompt_length=512,
)
# DPO 训练
dpo_trainer = DPOTrainer(
model=model,
ref_model=ref_model, # 参考模型
train_dataset=preference_data,
tokenizer=tokenizer,
args=dpo_config,
)
dpo_trainer.train()
"""
dpo = DPOExplainer()
print("=== DPO ===")
for k, v in dpo.OVERVIEW.items():
print(f" {k}: {v}")
Comparing Alignment Methods
"""
SFT vs RLHF vs DPO vs ORPO
"""
ALIGNMENT_COMPARISON = {
"方法": ["SFT", "RLHF", "DPO", "ORPO"],
"数据类型": [
"指令-回答对",
"偏好对 + RM",
"偏好对",
"偏好对",
],
"训练阶段": ["1阶段", "3阶段", "2阶段(SFT+DPO)", "1阶段"],
"复杂度": ["低", "高", "中", "低"],
"稳定性": ["高", "低", "高", "高"],
"效果": ["基础", "最好", "接近RLHF", "接近DPO"],
"推荐场景": [
"所有微调的第一步",
"顶级模型训练",
"大多数对齐需求",
"快速对齐实验",
],
}
print("=== 对齐方法对比 ===")
for i, method in enumerate(ALIGNMENT_COMPARISON["方法"]):
print(f"\n{method}:")
for key in ["数据类型", "训练阶段", "复杂度", "效果", "推荐场景"]:
print(f" {key}: {ALIGNMENT_COMPARISON[key][i]}")
Chapter Summary
| Method | Stages | Data needed | Complexity | Quality |
|---|---|---|---|---|
| SFT | 1 | instruction pairs | ⭐ | baseline |
| RLHF | 3 | preferences + RM | ⭐⭐⭐⭐⭐ | best |
| DPO | 2 | preference pairs | ⭐⭐ | close to RLHF |
| ORPO | 1 | preference pairs | ⭐ | close to DPO |
Recommended path: SFT → DPO (the best cost-benefit ratio).
Next chapter: Model Evaluation, or how to assess fine-tuning results rigorously.