成本优化策略
LLM API 成本很容易失控。一个日均 10 万请求的服务,每月成本可能从 $300 到 $30,000 不等,取决于所选模型、每次请求的 token 用量以及你的优化策略。
成本模型
graph TB
A[LLM 成本] --> B[输入成本]
A --> C[输出成本]
A --> D[基础设施成本]
B --> B1[System Prompt]
B --> B2[用户输入]
B --> B3[上下文/历史]
C --> C1[生成长度]
C --> C2[温度/采样]
D --> D1[GPU/服务器]
D --> D2[存储]
D --> D3[网络]
style A fill:#ffcdd2,stroke:#c62828,stroke-width:2px
style B fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
成本计算器
"""
LLM 成本计算与分析
"""
from dataclasses import dataclass
@dataclass
class ModelPricing:
    """Pricing and context-window metadata for a single model."""

    name: str  # human-readable display name
    input_per_1m: float  # price per one million input tokens
    output_per_1m: float  # price per one million output tokens
    context_window: int  # context window size, in tokens


class CostCalculator:
    """Estimate and compare monthly LLM API spend across a fixed model table."""

    # Token prices only. The self-hosted entry is priced at 0.0, so GPU/server
    # cost is NOT included and it always sorts as the cheapest option.
    MODELS = {
        "gpt-4o": ModelPricing("GPT-4o", 2.50, 10.00, 128000),
        "gpt-4o-mini": ModelPricing("GPT-4o Mini", 0.15, 0.60, 128000),
        "claude-3.5-sonnet": ModelPricing("Claude 3.5 Sonnet", 3.00, 15.00, 200000),
        "claude-3.5-haiku": ModelPricing("Claude 3.5 Haiku", 0.80, 4.00, 200000),
        "deepseek-v3": ModelPricing("DeepSeek V3", 0.27, 1.10, 64000),
        "llama-3.1-8b": ModelPricing("Llama 3.1 8B (自托管)", 0.0, 0.0, 128000),
    }

    def estimate_monthly_cost(
        self,
        model_key: str,
        daily_requests: int,
        avg_input_tokens: int = 500,
        avg_output_tokens: int = 200,
    ) -> dict:
        """Estimate the monthly cost of one model.

        A month is taken as 30 days.

        Args:
            model_key: Key into ``MODELS``.
            daily_requests: Average number of requests per day.
            avg_input_tokens: Average input tokens per request.
            avg_output_tokens: Average output tokens per request.

        Returns:
            A dict with ``model``, ``daily_requests``, ``monthly_requests``,
            ``input_cost``, ``output_cost``, ``total_monthly`` and
            ``cost_per_request``. For an unknown ``model_key`` the dict holds
            a single ``error`` entry instead.
        """
        model = self.MODELS.get(model_key)
        if model is None:
            return {"error": f"未知模型: {model_key}"}

        monthly_requests = daily_requests * 30
        input_cost = (
            monthly_requests * avg_input_tokens / 1_000_000
            * model.input_per_1m
        )
        output_cost = (
            monthly_requests * avg_output_tokens / 1_000_000
            * model.output_per_1m
        )
        total = input_cost + output_cost

        # With zero traffic there is nothing to average over; report 0.0
        # instead of raising ZeroDivisionError.
        cost_per_request = total / monthly_requests if monthly_requests else 0.0

        return {
            "model": model.name,
            "daily_requests": daily_requests,
            "monthly_requests": monthly_requests,
            "input_cost": round(input_cost, 2),
            "output_cost": round(output_cost, 2),
            "total_monthly": round(total, 2),
            "cost_per_request": round(cost_per_request, 6),
        }

    def compare_models(
        self,
        daily_requests: int,
        avg_input_tokens: int = 500,
        avg_output_tokens: int = 200,
    ) -> None:
        """Print a cost table for every model in ``MODELS``, cheapest first.

        Args:
            daily_requests: Average number of requests per day.
            avg_input_tokens: Average input tokens per request.
            avg_output_tokens: Average output tokens per request.
        """
        print(f"\n=== 模型成本对比 (日均 {daily_requests:,} 请求) ===")
        print(f"{'模型':<25} {'月成本':>12} {'单次成本':>12}")
        print("-" * 50)

        results = [
            self.estimate_monthly_cost(
                key, daily_requests, avg_input_tokens, avg_output_tokens
            )
            for key in self.MODELS
        ]
        results.sort(key=lambda x: x["total_monthly"])

        for r in results:
            print(
                f" {r['model']:<22} "
                f"${r['total_monthly']:>10,.2f} "
                f"${r['cost_per_request']:>10.6f}"
            )

        # The spread between the cheapest and the most expensive row.
        if len(results) >= 2:
            cheapest = results[0]
            expensive = results[-1]
            savings = expensive["total_monthly"] - cheapest["total_monthly"]
            print(f"\n 最大节省: ${savings:,.2f}/月 "
                  f"({cheapest['model']} vs {expensive['model']})")
# Usage: print the cost comparison table for 100,000 requests per day,
# using the default per-request token counts.
calc = CostCalculator()
calc.compare_models(daily_requests=100_000)
模型路由降本
"""
智能模型路由 - 按复杂度选择模型
"""
class SmartRouter:
    """Route each request to a model configuration by estimated complexity.

    Complexity is estimated with simple keyword matching; a trained
    classifier could replace ``classify_complexity`` in a real deployment.
    """

    # Keyword tables, built once instead of on every call.
    _SIMPLE_PATTERNS = (
        "你好", "谢谢", "是的", "对的",
        "帮我翻译", "总结一下",
    )
    _COMPLEX_PATTERNS = (
        "分析", "设计", "架构", "比较",
        "代码审查", "优化", "调试",
    )

    # Model configuration per complexity level.
    _MODEL_MAP = {
        "simple": {
            "model": "gpt-4o-mini",
            "max_tokens": 256,
            "temperature": 0.3,
        },
        "medium": {
            "model": "gpt-4o-mini",
            "max_tokens": 1024,
            "temperature": 0.7,
        },
        "complex": {
            "model": "gpt-4o",
            "max_tokens": 4096,
            "temperature": 0.7,
        },
    }

    # Complexity level -> key in ``routing_stats``.
    _TIER_BY_COMPLEXITY = {
        "simple": "cheap",
        "medium": "mid",
        "complex": "expensive",
    }

    def __init__(self):
        # Number of routed requests per tier.
        self.routing_stats = {"cheap": 0, "mid": 0, "expensive": 0}

    def classify_complexity(self, query: str) -> str:
        """Classify a query as ``"simple"``, ``"medium"`` or ``"complex"``.

        Routing targets (see ``route``):
            simple  -> gpt-4o-mini, short answers
            medium  -> gpt-4o-mini, longer answers
            complex -> gpt-4o

        Complex keywords take precedence over simple ones: a greeting or a
        "thanks" in front of a design/analysis request must not send that
        request to the cheapest tier with a 256-token cap. Queries matching
        neither list are ``"medium"``.
        """
        text = query.lower()
        if any(p in text for p in self._COMPLEX_PATTERNS):
            return "complex"
        if any(p in text for p in self._SIMPLE_PATTERNS):
            return "simple"
        return "medium"

    def route(self, query: str) -> dict:
        """Pick a model configuration for ``query`` and update the statistics.

        Returns:
            A new dict with ``model``, ``max_tokens``, ``temperature`` and
            ``complexity``.
        """
        complexity = self.classify_complexity(query)
        self.routing_stats[self._TIER_BY_COMPLEXITY[complexity]] += 1
        # Return a copy so callers cannot mutate the shared table.
        return {**self._MODEL_MAP[complexity], "complexity": complexity}

    def print_stats(self) -> None:
        """Print per-tier request counts and percentages.

        Prints nothing when no request has been routed yet.
        """
        total = sum(self.routing_stats.values())
        if total == 0:
            return
        print("\n=== 路由统计 ===")
        for tier, count in self.routing_stats.items():
            pct = count / total * 100
            print(f" {tier}: {count} ({pct:.0f}%)")
# Usage: route a set of sample queries and print the model chosen for each.
router = SmartRouter()
queries = [
    "你好",
    "帮我翻译这句话",
    "分析这段代码的性能瓶颈并给出优化方案",
    "Python是什么?",
    "设计一个高可用的微服务架构",
    "谢谢",
    "比较 PostgreSQL 和 MongoDB 的优劣",
]
for q in queries:
    config = router.route(q)
    # Columns: complexity (right-aligned, width 7), the query cut and padded
    # to 30 characters, then the selected model.
    print(f" [{config['complexity']:>7}] {q[:30]:30s} → {config['model']}")
# Show how many requests went to each tier.
router.print_stats()
成本监控与告警
"""
成本监控与预警
"""
import time
from collections import defaultdict
class CostMonitor:
    """Track LLM spend against a daily budget and produce alert messages."""

    def __init__(
        self,
        daily_budget: float = 100.0,
        alert_threshold: float = 0.8,
    ):
        """Create a monitor.

        Args:
            daily_budget: Budget per day; must be positive.
            alert_threshold: Fraction of the budget at which a warning is
                issued.

        Raises:
            ValueError: If ``daily_budget`` is not positive. Every usage
                ratio divides by it.
        """
        if daily_budget <= 0:
            raise ValueError("daily_budget must be positive")
        self.daily_budget = daily_budget
        self.alert_threshold = alert_threshold
        self.daily_cost: float = 0.0
        self.hourly_costs: defaultdict = defaultdict(float)  # "HH" -> cost
        self.model_costs: defaultdict = defaultdict(float)  # model -> cost
        # Start of the current accounting day (epoch seconds).
        self.reset_time: float = time.time()

    def _roll_over_if_new_day(self) -> None:
        """Clear all counters when the local calendar day has changed.

        Without this the "daily" totals grow forever, and the hourly buckets
        (keyed by hour of day only) mix the same hour of different days.
        """
        now = time.time()
        # Compare (year, month, day) in local time, matching the local-time
        # hour keys produced by time.strftime("%H").
        if time.localtime(now)[:3] != time.localtime(self.reset_time)[:3]:
            self.daily_cost = 0.0
            self.hourly_costs.clear()
            self.model_costs.clear()
            self.reset_time = now

    def record_cost(self, cost: float, model: str) -> list[str]:
        """Record the cost of one request and return any triggered alerts.

        Args:
            cost: Cost of the request.
            model: Name of the model that served it.

        Returns:
            Alert messages; empty when nothing is wrong.
        """
        self._roll_over_if_new_day()
        self.daily_cost += cost
        hour = time.strftime("%H")
        self.hourly_costs[hour] += cost
        self.model_costs[model] += cost
        return self._check_alerts()

    def _check_alerts(self) -> list[str]:
        """Build alert messages for the current state."""
        alerts = []

        # Daily budget: hard alert at 100%, warning from alert_threshold.
        usage_pct = self.daily_cost / self.daily_budget
        if usage_pct >= 1.0:
            alerts.append(
                f"🚨 日预算超标!当前: ${self.daily_cost:.2f} / "
                f"${self.daily_budget:.2f}"
            )
        elif usage_pct >= self.alert_threshold:
            alerts.append(
                f"⚠️ 日预算已用 {usage_pct:.0%}:${self.daily_cost:.2f} / "
                f"${self.daily_budget:.2f}"
            )

        # Hourly anomaly: the current hour exceeds 3x an even 1/24 share of
        # the daily budget.
        hour = time.strftime("%H")
        hourly_avg = self.daily_budget / 24
        if self.hourly_costs[hour] > hourly_avg * 3:
            alerts.append(
                f"⚠️ 当前小时成本异常: ${self.hourly_costs[hour]:.2f} "
                f"(均值: ${hourly_avg:.2f})"
            )
        return alerts

    def get_report(self) -> dict:
        """Return a summary of the current counters.

        Returns:
            A dict with ``daily_cost``, ``daily_budget``, ``usage_pct`` (a
            formatted percentage string), ``model_breakdown`` and
            ``top_hour`` (an ``(hour, cost)`` pair, or ``("N/A", 0)`` when
            nothing has been recorded).
        """
        return {
            "daily_cost": round(self.daily_cost, 2),
            "daily_budget": self.daily_budget,
            "usage_pct": f"{self.daily_cost / self.daily_budget:.0%}",
            "model_breakdown": {
                k: round(v, 4) for k, v in self.model_costs.items()
            },
            "top_hour": max(
                self.hourly_costs.items(),
                key=lambda x: x[1],
                default=("N/A", 0),
            ),
        }

    def print_report(self) -> None:
        """Print the daily total and the per-model breakdown."""
        r = self.get_report()
        print("\n=== 成本报告 ===")
        print(f" 日成本: ${r['daily_cost']} / ${r['daily_budget']} ({r['usage_pct']})")
        print(" 模型细分:")
        for model, cost in r["model_breakdown"].items():
            print(f" {model}: ${cost}")
# Usage: feed simulated traffic into a monitor with a daily budget of 50.
monitor = CostMonitor(daily_budget=50.0)

# Simulation: 200 requests. "gpt-4o-mini" is listed twice, so it is drawn
# about twice as often as "gpt-4o".
import random

for _ in range(200):
    model = random.choice(["gpt-4o", "gpt-4o-mini", "gpt-4o-mini"])
    if model == "gpt-4o":
        cost = 0.05
    else:
        cost = 0.002
    alerts = monitor.record_cost(cost, model)
    for a in alerts:
        print(a)

monitor.print_report()
降本策略总结
| 策略 | 预期节省 | 实施难度 | 风险 |
|---|---|---|---|
| 用小模型替代大模型 | 50-90% | 低 | 质量下降 |
| 语义缓存 | 30-60% | 中 | 缓存过期 |
| Prompt 压缩 | 10-30% | 低 | 信息丢失 |
| 智能路由 | 30-50% | 中 | 分类错误 |
| 限制 max_tokens | 10-20% | 低 | 回答不完整 |
| 批处理 | 20-40% | 中 | 延迟增加 |
| 开源自托管 | 70-95% | 高 | 运维成本 |
黄金法则:先用小模型尝试,效果不达标再升级到更大的模型。根据经验,约 80% 的请求用 GPT-4o-mini 就够了。
下一章:监控与可观测性。