模型对比与选择
High Contrast
Dark Mode
Light Mode
Sepia
Forest
1 min read · 154 words

模型对比与选择

微调了三个版本,到底用哪个?怎么系统地比较和决策?

模型选择决策流程

graph TD
    START[候选模型池] --> BENCH[基准测试评估]
    BENCH --> FILTER{达标?}
    FILTER -->|否| REJECT[淘汰]
    FILTER -->|是| TASK[任务特定评估]
    TASK --> COST[成本评估]
    COST --> LATENCY[延迟测试]
    LATENCY --> HUMAN[人工抽检]
    HUMAN --> FINAL[最终选择]
    BENCH -->|ROUGE / F1| B1[自动指标排名]
    TASK -->|领域数据| T1[业务场景测试]
    COST -->|GPU / API| C1[推理成本计算]
    style START fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
    style FINAL fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
    style REJECT fill:#ffcdd2,stroke:#c62828,stroke-width:2px

基准测试对比

"""
模型基准测试对比
"""
from dataclasses import dataclass, field
from typing import ClassVar
@dataclass
class ModelBenchmark:
    """Reference card for comparing candidate models on public benchmarks.

    Holds only class-level constants. They are annotated ``ClassVar`` so the
    dataclass machinery never treats them as instance fields (an un-annotated
    ``dict`` default would otherwise be an easy mistake to introduce later).
    """

    # Benchmark name -> {"内容": what it contains, "用途": capability probed,
    # "参考线": reference scores}. Scores are a snapshot — re-check before citing.
    POPULAR_BENCHMARKS: ClassVar[dict[str, dict[str, str]]] = {
        "MMLU": {
            "内容": "57 个学科的多选题",
            "用途": "通用知识能力",
            "参考线": "GPT-4: 86.4, LLaMA-3-70B: 79.5",
        },
        "HumanEval": {
            "内容": "164 道编程题",
            "用途": "代码生成能力",
            "参考线": "GPT-4: 67.0, CodeLlama-34B: 48.8",
        },
        "MT-Bench": {
            "内容": "多轮对话评分",
            "用途": "对话能力",
            "参考线": "GPT-4: 9.01, LLaMA-3-70B: 8.16",
        },
        "C-Eval": {
            "内容": "中文学科多选题",
            "用途": "中文能力",
            "参考线": "GPT-4: 68.7, Qwen-72B: 83.3",
        },
        "GSM8K": {
            "内容": "小学数学应用题",
            "用途": "数学推理",
            "参考线": "GPT-4: 92.0, LLaMA-3-70B: 93.0",
        },
    }

    # Side-by-side comparison of common instruction-tuned checkpoints
    # (markdown, meant to be printed or pasted into a report).
    COMPARISON_TABLE: ClassVar[str] = """
| 模型 | 参数量 | MMLU | HumanEval | MT-Bench | 推理速度 | 显存需求 |
|------|--------|------|-----------|----------|---------|---------|
| Qwen2.5-7B-Instruct | 7B | 74.2 | 57.3 | 7.6 | 快 | 16GB |
| LLaMA-3.1-8B-Instruct | 8B | 73.0 | 60.4 | 7.8 | 快 | 16GB |
| Mistral-7B-Instruct-v0.3 | 7B | 62.5 | 40.2 | 7.1 | 快 | 16GB |
| Qwen2.5-72B-Instruct | 72B | 85.3 | 71.2 | 8.6 | 慢 | 140GB |
| LLaMA-3.1-70B-Instruct | 70B | 82.0 | 65.4 | 8.4 | 慢 | 140GB |
"""
# Quick demo: list each benchmark with the capability it probes
# and the published reference scores.
bm = ModelBenchmark()
print("=== 常用基准测试 ===")
for benchmark_name, details in bm.POPULAR_BENCHMARKS.items():
    print(f"\n{benchmark_name}: {details['用途']}")
    print(f"  参考线: {details['参考线']}")

任务特定评估

"""
任务特定评估:在你的业务数据上跑
"""
class TaskSpecificEval:
    """Evaluate candidate models on your own task-specific test data.

    Results are accumulated in ``self.results`` keyed by model name, so
    several candidates can be evaluated and then compared side by side.
    """

    def __init__(self) -> None:
        # model name -> metrics dict produced by evaluate_model()
        self.results: dict[str, dict] = {}

    def evaluate_model(
        self,
        model_name: str,
        model,
        tokenizer,
        test_data: list[dict],
        task_type: str = "generation",
    ) -> dict:
        """Run ``model`` over ``test_data``, recording latency (and accuracy).

        Args:
            model_name: Key under which the metrics are stored.
            model: Causal LM exposing ``.generate(...)`` and ``.device``
                (assumes a HuggingFace-style interface — TODO confirm).
            tokenizer: Matching tokenizer (callable, with ``.decode``).
            test_data: Samples, each with an ``"instruction"`` key;
                classification samples must also carry ``"output"``.
            task_type: ``"classification"`` adds a substring-match accuracy
                metric; any other value measures latency/throughput only.

        Returns:
            Dict with avg/p95 latency (ms) and throughput (QPS), plus
            ``"accuracy"`` for classification. Also stored in ``self.results``.

        Raises:
            ValueError: If ``test_data`` is empty — the latency and accuracy
                statistics would be undefined (NaN / division by zero).
        """
        import time
        import numpy as np

        if not test_data:
            raise ValueError("test_data must contain at least one sample")

        latencies: list[float] = []
        correct = 0
        total = len(test_data)
        for sample in test_data:
            start = time.time()
            inputs = tokenizer(
                sample["instruction"],
                return_tensors="pt",
            ).to(model.device)
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,  # greedy decoding keeps runs comparable
            )
            # Decode only the newly generated tokens, not the echoed prompt.
            pred = tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )
            latencies.append(time.time() - start)
            # Lenient check: the gold label must appear in the generation.
            if task_type == "classification" and sample["output"].strip() in pred:
                correct += 1

        result = {
            "model": model_name,
            "avg_latency_ms": float(np.mean(latencies) * 1000),
            "p95_latency_ms": float(np.percentile(latencies, 95) * 1000),
            "throughput_qps": 1.0 / float(np.mean(latencies)),
        }
        if task_type == "classification":
            result["accuracy"] = correct / total
        self.results[model_name] = result
        return result

    def compare_models(self) -> str:
        """Render all evaluated models as an aligned table, fastest first."""
        if not self.results:
            return "没有评估结果"
        lines = ["模型对比结果:", ""]
        lines.append(
            f"{'模型':<30} {'延迟(ms)':<12} {'P95(ms)':<12} {'QPS':<8}"
        )
        lines.append("-" * 62)
        for name, r in sorted(
            self.results.items(),
            key=lambda x: x[1]["avg_latency_ms"],
        ):
            lines.append(
                f"{name:<30} {r['avg_latency_ms']:<12.1f} "
                f"{r['p95_latency_ms']:<12.1f} {r['throughput_qps']:<8.2f}"
            )
        return "\n".join(lines)
# Usage example (printed, not executed — candidate models are loaded
# elsewhere). Bound to a module-level name so it is reusable/testable.
# Fix: the loop body inside the snippet was missing its indentation.
USAGE_EXAMPLE = """
evaluator = TaskSpecificEval()
# 评估候选模型
for model_name, model, tokenizer in candidates:
    evaluator.evaluate_model(model_name, model, tokenizer, test_data)
# 对比
print(evaluator.compare_models())
"""
print(USAGE_EXAMPLE)

成本分析

"""
推理成本分析
"""
class InferenceCostAnalysis:
    """Compare inference cost: hosted APIs vs self-deployed GPU serving."""

    # Pricing snapshot (USD). API prices are per 1M tokens; self-hosted
    # entries list GPU hourly cost, rough throughput, and 24x7 monthly cost.
    # Prices drift — re-verify before making a decision.
    COST_TABLE = {
        "API 方案": {
            "GPT-4o": {
                "input_per_1m": 2.5,
                "output_per_1m": 10.0,
                "适用": "高质量要求,低量级",
            },
            "GPT-4o-mini": {
                "input_per_1m": 0.15,
                "output_per_1m": 0.60,
                "适用": "性价比之选",
            },
            "DeepSeek-V3": {
                "input_per_1m": 0.27,
                "output_per_1m": 1.10,
                "适用": "中文场景性价比",
            },
        },
        "自部署方案": {
            "7B-QLoRA (A10G)": {
                "gpu_hourly": 1.0,
                "throughput_qps": 15,
                "monthly": 720,
                "适用": "中等流量",
            },
            "7B-GPTQ (T4)": {
                "gpu_hourly": 0.5,
                "throughput_qps": 8,
                "monthly": 360,
                "适用": "低成本起步",
            },
            "70B-AWQ (A100x2)": {
                "gpu_hourly": 6.0,
                "throughput_qps": 5,
                "monthly": 4320,
                "适用": "大模型高质量",
            },
        },
    }

    @staticmethod
    def calculate_api_cost(
        daily_requests: int,
        avg_input_tokens: int,
        avg_output_tokens: int,
        price_per_1m_input: float,
        price_per_1m_output: float,
        days_per_month: int = 30,
    ) -> dict:
        """Estimate the monthly API bill for a given traffic profile.

        Args:
            daily_requests: Average requests per day.
            avg_input_tokens: Mean prompt length per request (tokens).
            avg_output_tokens: Mean completion length per request (tokens).
            price_per_1m_input: USD per 1M input tokens.
            price_per_1m_output: USD per 1M output tokens.
            days_per_month: Billing-period length; defaults to 30
                (generalized from the previous hard-coded constant).

        Returns:
            Dict with monthly request volume and formatted USD costs.
        """
        monthly_requests = daily_requests * days_per_month
        input_cost = (
            monthly_requests
            * avg_input_tokens
            / 1_000_000
            * price_per_1m_input
        )
        output_cost = (
            monthly_requests
            * avg_output_tokens
            / 1_000_000
            * price_per_1m_output
        )
        return {
            "月请求量": monthly_requests,
            "输入成本": f"${input_cost:.2f}",
            "输出成本": f"${output_cost:.2f}",
            "总月成本": f"${input_cost + output_cost:.2f}",
        }
# Worked example: 10k requests/day at GPT-4o-mini pricing.
cost = InferenceCostAnalysis()
result = cost.calculate_api_cost(
    daily_requests=10000,
    avg_input_tokens=500,
    avg_output_tokens=200,
    price_per_1m_input=0.15,
    price_per_1m_output=0.60,
)
print("GPT-4o-mini 成本分析:")
for label, value in result.items():
    print(f"  {label}: {value}")

模型选择决策矩阵

| 选择因素 | 7B 开源微调 | 70B 开源微调 | GPT-4o API | GPT-4o-mini API |
|----------|-------------|--------------|------------|-----------------|
| 质量 | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| 成本 | ⭐⭐⭐⭐⭐ | ⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐ |
| 延迟 | ⭐⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ |
| 隐私控制 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | —(原文缺失) | —(原文缺失) |
| 定制灵活性 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐ | ⭐⭐ |
| 运维复杂度 | —(原文缺失) | 很高 | —(原文缺失) | —(原文缺失) |

(注:原文此表部分单元格在排版中丢失,以"原文缺失"标出,未作臆测补全。)

下一章:模型导出与推理优化——让模型跑得又快又省。