成本优化与推理加速
LLM 推理成本会随流量增长快速膨胀。掌握模型选择、缓存、量化、批处理等技巧,能将成本降低 50-90%。
成本优化全景
graph LR
A[降低 LLM 成本] --> B[模型选择]
A --> C[缓存策略]
A --> D[Prompt 优化]
A --> E[推理优化]
B --> B1[小模型替代]
B --> B2[模型路由]
C --> C1[语义缓存]
C --> C2[KV Cache]
D --> D1[缩短 Prompt]
D --> D2[结构化输出]
E --> E1[量化]
E --> E2[批处理]
style A fill:#ffcdd2,stroke:#c62828,stroke-width:2px
style B fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style C fill:#fff9c4,stroke:#f9a825,stroke-width:2px
style E fill:#c8e6c9,stroke:#43a047,stroke-width:2px
模型路由器
from dataclasses import dataclass
from enum import Enum
class Complexity(Enum):
    """Task complexity tiers used to route a prompt to a cost-appropriate model."""
    SIMPLE = "simple"    # mechanical tasks: translate, summarize, extract, ...
    MEDIUM = "medium"    # default tier when no strong signal either way
    COMPLEX = "complex"  # multi-step reasoning, analysis, design
@dataclass
class ModelConfig:
    """Pricing and capability profile for one LLM endpoint."""
    name: str                   # provider model identifier
    input_cost_per_1k: float    # USD per 1K input tokens
    output_cost_per_1k: float   # USD per 1K output tokens
    max_tokens: int             # context window size
    latency_ms: int             # average latency
# One model tier per complexity level (pricing as of 2024, USD per 1K tokens).
MODELS = {
    Complexity.SIMPLE: ModelConfig(
        name="deepseek-v3",
        input_cost_per_1k=0.00014,
        output_cost_per_1k=0.00028,
        max_tokens=65536,
        latency_ms=200,
    ),
    Complexity.MEDIUM: ModelConfig(
        name="gpt-4o-mini",
        input_cost_per_1k=0.00015,
        output_cost_per_1k=0.0006,
        max_tokens=128000,
        latency_ms=400,
    ),
    Complexity.COMPLEX: ModelConfig(
        name="claude-3.5-sonnet",
        input_cost_per_1k=0.003,
        output_cost_per_1k=0.015,
        max_tokens=200000,
        latency_ms=800,
    ),
}
class ModelRouter:
"""根据任务复杂度自动路由到合适模型"""
COMPLEX_KEYWORDS = ["分析", "推理", "比较", "评估", "设计", "架构"]
SIMPLE_KEYWORDS = ["翻译", "摘要", "格式化", "提取", "分类"]
def classify(self, prompt: str) -> Complexity:
prompt_lower = prompt.lower()
token_estimate = len(prompt) // 2 # 粗估
# 长 prompt 通常更复杂
if token_estimate > 2000:
return Complexity.COMPLEX
complex_count = sum(
1 for kw in self.COMPLEX_KEYWORDS if kw in prompt_lower
)
simple_count = sum(
1 for kw in self.SIMPLE_KEYWORDS if kw in prompt_lower
)
if complex_count >= 2:
return Complexity.COMPLEX
if simple_count >= 1 and complex_count == 0:
return Complexity.SIMPLE
return Complexity.MEDIUM
def route(self, prompt: str) -> ModelConfig:
complexity = self.classify(prompt)
return MODELS[complexity]
def estimate_cost(
self, prompt: str, est_output_tokens: int = 500
) -> dict:
model = self.route(prompt)
input_tokens = len(prompt) // 2
input_cost = (input_tokens / 1000) * model.input_cost_per_1k
output_cost = (est_output_tokens / 1000) * model.output_cost_per_1k
return {
"model": model.name,
"input_tokens": input_tokens,
"output_tokens": est_output_tokens,
"total_cost": round(input_cost + output_cost, 6),
}
语义缓存
from dataclasses import dataclass, field
from datetime import datetime, timedelta
@dataclass
class CacheEntry:
    """One cached (query, embedding, response) triple."""
    query: str                  # original query text, kept for inspection
    query_vector: list[float]   # embedding used for similarity matching
    response: str               # cached LLM response returned on a hit
    # Naive local time; compared against datetime.now() during eviction.
    created_at: datetime = field(default_factory=datetime.now)
    hit_count: int = 0          # number of times this entry served a hit
class SemanticCache:
"""语义级缓存——相似问题命中同一缓存"""
def __init__(
self, similarity_threshold: float = 0.92, ttl_hours: int = 24
):
self.entries: list[CacheEntry] = []
self.threshold = similarity_threshold
self.ttl = timedelta(hours=ttl_hours)
self.stats = {"hits": 0, "misses": 0}
def get(self, query_vector: list[float]) -> str | None:
self._evict_expired()
best_match = None
best_sim = 0.0
for entry in self.entries:
sim = self._cosine_sim(query_vector, entry.query_vector)
if sim > best_sim:
best_sim = sim
best_match = entry
if best_match and best_sim >= self.threshold:
best_match.hit_count += 1
self.stats["hits"] += 1
return best_match.response
self.stats["misses"] += 1
return None
def put(
self, query: str, query_vector: list[float], response: str
) -> None:
self.entries.append(CacheEntry(query, query_vector, response))
@property
def hit_rate(self) -> float:
total = self.stats["hits"] + self.stats["misses"]
return self.stats["hits"] / total if total > 0 else 0.0
def _evict_expired(self) -> None:
now = datetime.now()
self.entries = [
e for e in self.entries if now - e.created_at < self.ttl
]
@staticmethod
def _cosine_sim(a: list[float], b: list[float]) -> float:
import math
dot = sum(x * y for x, y in zip(a, b))
ma = math.sqrt(sum(x**2 for x in a))
mb = math.sqrt(sum(x**2 for x in b))
return dot / (ma * mb) if ma and mb else 0.0
常见优化手段对比
| 优化手段 | 成本降幅 | 延迟影响 | 实现难度 | 质量影响 |
|---|---|---|---|---|
| 模型降级路由 | 40-70% | 降低 | 中 | 轻微 |
| 语义缓存 | 30-60% | 大幅降低 | 中 | 无 |
| Prompt 压缩 | 20-40% | 降低 | 低 | 可能降低 |
| 批量推理 | 30-50% | 增加(吞吐提升) | 低 | 无 |
| 量化 (INT8/INT4) | 50-75% | 降低 | 高 | 轻微 |
| 知识蒸馏 | 60-80% | 降低 | 高 | 中等 |
| KV Cache 优化 | 20-30% | 降低 | 高 | 无 |
月度成本预算示例
| 场景 | 日请求量 | 平均input/output | 模型 | 月成本(美元) |
|---|---|---|---|---|
| 内部工具 | 1,000 | 500/200 tokens | DeepSeek-V3 | ~$3 |
| 客服Bot | 10,000 | 800/300 tokens | GPT-4o-mini | ~$90 |
| 内容生成 | 5,000 | 1000/2000 tokens | Claude-3.5-Sonnet | ~$4,950 |
| 同上+路由优化 | 5,000 | 1000/2000 tokens | 混合路由 | ~$600 |
本章小结
- 模型路由是性价比最高的优化——70% 请求可用小模型处理
- 语义缓存降低重复成本——相似问题直接返回缓存结果
- 量化适合自部署——INT8 几乎无质量损失,成本减半
- 先监控再优化——用日志分析哪些请求消耗最高
- 设置预算告警——避免意外的 API 账单
下一章:开源模型生态