Fine-Tuning Methods in Detail
Full fine-tuning, LoRA, and QLoRA each have their own strengths and weaknesses; picking the right method means half the effort for twice the result.
The Fine-Tuning Landscape
```mermaid
graph LR
    FT[Fine-tuning methods] --> FULL[Full fine-tuning]
    FT --> PEFT[Parameter-Efficient Fine-Tuning PEFT]
    FULL --> F1[Updates all parameters]
    FULL --> F2[Best quality]
    FULL --> F3[Highest cost]
    PEFT --> LORA[LoRA]
    PEFT --> QLORA[QLoRA]
    PEFT --> ADAPTER[Adapter]
    PEFT --> PREFIX[Prefix Tuning]
    LORA --> L1[Low-rank matrix decomposition]
    LORA --> L2[Trains only 0.1-1% of parameters]
    QLORA --> Q1[4-bit quantization + LoRA]
    QLORA --> Q2[Fine-tunes 70B models on a single GPU]
    style LORA fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
    style QLORA fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
```
Full Fine-Tuning
"""
全量微调:更新模型所有参数
- 效果最好但成本最高
- 适合大规模数据 + 充足 GPU
"""
class FullFineTuning:
"""全量微调说明"""
OVERVIEW = {
"原理": "反向传播更新模型所有权重参数",
"参数量": "100%(7B=70亿参数全部更新)",
"显存需求": "模型参数 + 梯度 + 优化器状态",
"典型配置": {
"7B 模型": "2x A100 80GB (FP16)",
"13B 模型": "4x A100 80GB (FP16)",
"70B 模型": "8x A100 80GB (FP16) + DeepSpeed",
},
}
# 显存计算公式
MEMORY_FORMULA = """
显存 ≈ 模型参数 × (
2 bytes (FP16 权重)
+ 2 bytes (FP16 梯度)
+ 8 bytes (AdamW 优化器: momentum + variance + master weights)
) = 参数量 × 12 bytes
例:7B 模型 = 7×10⁹ × 12 = 84 GB
→ 需要 2x A100 80GB (含激活值开销)
"""
    TRAINING_CODE = """
    from transformers import (
        AutoModelForCausalLM, AutoTokenizer,
        TrainingArguments, Trainer, DataCollatorForLanguageModeling
    )
    from datasets import load_dataset

    # 1. Load the model and tokenizer
    model_name = "meta-llama/Llama-3.1-8B"
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="float16"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

    # 2. Prepare the data
    dataset = load_dataset("json", data_files="train_data.jsonl")

    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=2048,
            padding="max_length",
        )

    tokenized = dataset.map(tokenize, batched=True)

    # 3. Training configuration
    training_args = TrainingArguments(
        output_dir="./output",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        deepspeed="ds_config.json",  # DeepSpeed config, needed for multi-GPU runs
    )

    # 4. Train (the collator copies input_ids into labels for the causal LM loss)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    """

ft = FullFineTuning()
print("=== Full fine-tuning ===")
print(f"Trainable parameters: {ft.OVERVIEW['trainable_params']}")
print("\nMemory requirements:")
for size, gpus in ft.OVERVIEW["typical_setups"].items():
    print(f"  {size}: {gpus}")
```
LoRA: Low-Rank Adaptation
"""
LoRA (Low-Rank Adaptation)
核心思想:冻结原始权重,只训练低秩增量矩阵
W' = W + ΔW = W + A × B
其中 A ∈ R^(d×r), B ∈ R^(r×d), r << d
"""
class LoRAExplainer:
"""LoRA 详解"""
PRINCIPLE = {
"核心公式": "W' = W + A × B",
"W": "原始权重矩阵(冻结不动)",
"A": "下投影矩阵 (d × r),随机初始化",
"B": "上投影矩阵 (r × d),初始化为零",
"r (rank)": "低秩维度,通常 8-64",
"参数量": "只训练 A 和 B,约原始 0.1-1%",
}
HYPERPARAMS = {
"r (rank)": {
"范围": "4-128",
"推荐": "16-64",
"说明": "越大能力越强,但参数越多",
},
"lora_alpha": {
"范围": "8-128",
"推荐": "等于 r 或 2×r",
"说明": "缩放因子 alpha/r 控制学习幅度",
},
"target_modules": {
"范围": "attention 层的 q/k/v/o",
"推荐": '["q_proj", "v_proj"]',
"说明": "应用 LoRA 的目标层",
},
"lora_dropout": {
"范围": "0.0-0.1",
"推荐": "0.05",
"说明": "正则化,防止过拟合",
},
}
    TRAINING_CODE = """
    from peft import LoraConfig, get_peft_model, TaskType
    from transformers import AutoModelForCausalLM

    # 1. Load the base model
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B",
        torch_dtype="float16",
        device_map="auto",
    )

    # 2. Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,                      # rank
        lora_alpha=32,             # scaling factor
        lora_dropout=0.05,         # dropout
        target_modules=[           # layers to adapt
            "q_proj", "k_proj",
            "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
    )

    # 3. Apply LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    # e.g. trainable params on the order of tens of millions, well under 1% of 8B

    # 4. Train with the same Trainer setup as in full fine-tuning
    # ... (Trainer configuration as above)

    # 5. Save the LoRA weights (only tens of MB)
    model.save_pretrained("lora-output")

    # 6. Merge for inference
    from peft import PeftModel
    base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
    merged = PeftModel.from_pretrained(base_model, "lora-output")
    merged = merged.merge_and_unload()  # fold the adapter back into the base model
    """

lora = LoRAExplainer()
print("=== LoRA ===")
for k, v in lora.PRINCIPLE.items():
    print(f"  {k}: {v}")
print("\nHyperparameters:")
for name, info in lora.HYPERPARAMS.items():
    print(f"  {name}: recommended {info['recommended']} ({info['notes']})")
```
QLoRA: Quantization + LoRA
"""
QLoRA: 4-bit 量化基座 + LoRA 训练
单卡 A100 即可微调 70B 模型
"""
class QLoRAExplainer:
"""QLoRA 详解"""
PRINCIPLE = {
"核心": "用 4-bit 量化加载基座模型,再用 LoRA 训练",
"关键技术": [
"4-bit NormalFloat (NF4) 量化",
"双量化 (Double Quantization)",
"分页优化器 (Paged Optimizers)",
],
"显存": "7B → 6 GB, 13B → 10 GB, 70B → 40 GB",
}
    TRAINING_CODE = """
    from transformers import (
        AutoModelForCausalLM, BitsAndBytesConfig
    )
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    import torch

    # 1. 4-bit quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",               # NF4 quantization
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,          # double quantization
    )

    # 2. Load the quantized model
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-70B",
        quantization_config=bnb_config,
        device_map="auto",
    )

    # 3. Prepare for k-bit training
    model = prepare_model_for_kbit_training(model)

    # 4. LoRA config (same as before)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    # Even for the 70B model, only tens of millions of parameters are trained
    """

qlora = QLoRAExplainer()
print("=== QLoRA ===")
print(f"Idea: {qlora.PRINCIPLE['idea']}")
print(f"Memory: {qlora.PRINCIPLE['memory']}")
```
Method Comparison
"""
微调方法全面对比
"""
METHOD_COMPARISON = {
"方法": ["全量微调", "LoRA", "QLoRA", "Adapter", "Prefix Tuning"],
"训练参数": ["100%", "0.1-1%", "0.1-1%", "1-5%", "0.1%"],
"显存 (7B)": ["84 GB", "16 GB", "6 GB", "20 GB", "14 GB"],
"训练速度": ["慢", "快", "中", "中", "快"],
"效果": ["最好", "接近全量", "略低于LoRA", "中等", "较弱"],
"部署": ["完整模型", "基座+适配器", "基座+适配器", "基座+适配器", "基座+前缀"],
"适用场景": [
"充足GPU+大数据",
"通用推荐方案",
"单卡大模型",
"多任务切换",
"生成控制",
],
}
print("=== 微调方法对比 ===")
for i, method in enumerate(METHOD_COMPARISON["方法"]):
print(f"\n{method}:")
for key in ["训练参数", "显存 (7B)", "效果", "适用场景"]:
print(f" {key}: {METHOD_COMPARISON[key][i]}")
Chapter Summary
| Method | Trainable params | Memory (7B) | Quality | Recommended for |
|---|---|---|---|---|
| Full fine-tuning | 100% | 84 GB | ⭐⭐⭐⭐⭐ | large datasets + multiple GPUs |
| LoRA | 0.1-1% | 16 GB | ⭐⭐⭐⭐ | the general-purpose default |
| QLoRA | 0.1-1% | 6 GB | ⭐⭐⭐⭐ | single GPU / limited resources |
| Adapter | 1-5% | 20 GB | ⭐⭐⭐ | multi-task switching |
| Prefix | 0.1% | 14 GB | ⭐⭐⭐ | simple generation control |
Next chapter: Data Preparation, covering the collection, cleaning, and formatting of high-quality training data.