成本优化与推理加速
LLM 推理成本会随流量增长快速膨胀。掌握模型选择、缓存、量化、批处理等技巧,能将成本降低 50-90%。
成本优化全景
graph LR
A[降低 LLM 成本] --> B[模型选择]
A --> C[缓存策略]
A --> D[Prompt 优化]
A --> E[推理优化]
B --> B1[小模型替代]
B --> B2[模型路由]
C --> C1[语义缓存]
C --> C2[KV Cache]
D --> D1[缩短 Prompt]
D --> D2[结构化输出]
E --> E1[量化]
E --> E2[批处理]
style A fill:#ffcdd2,stroke:#c62828,stroke-width:2px
style B fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style C fill:#fff9c4,stroke:#f9a825,stroke-width:2px
style E fill:#c8e6c9,stroke:#43a047,stroke-width:2px
模型路由器
from dataclasses import dataclass
from enum import Enum
class Complexity(Enum):
    """Task complexity tiers used to route a prompt to a cost-appropriate model."""
    SIMPLE = "simple"    # mechanical tasks: translate, summarize, extract, ...
    MEDIUM = "medium"    # default tier when no strong signal either way
    COMPLEX = "complex"  # multi-step reasoning, analysis, design
@dataclass
class ModelConfig:
    """Pricing and capability profile for one LLM endpoint."""
    name: str                   # provider model identifier
    input_cost_per_1k: float    # USD per 1K input tokens
    output_cost_per_1k: float   # USD per 1K output tokens
    max_tokens: int             # context window size
    latency_ms: int             # average latency
# One model tier per complexity level (pricing as of 2024, USD per 1K tokens).
MODELS = {
    Complexity.SIMPLE: ModelConfig(
        name="deepseek-v3",
        input_cost_per_1k=0.00014,
        output_cost_per_1k=0.00028,
        max_tokens=65536,
        latency_ms=200,
    ),
    Complexity.MEDIUM: ModelConfig(
        name="gpt-4o-mini",
        input_cost_per_1k=0.00015,
        output_cost_per_1k=0.0006,
        max_tokens=128000,
        latency_ms=400,
    ),
    Complexity.COMPLEX: ModelConfig(
        name="claude-3.5-sonnet",
        input_cost_per_1k=0.003,
        output_cost_per_1k=0.015,
        max_tokens=200000,
        latency_ms=800,
    ),
}
class ModelRouter:
"""根据任务复杂度自动路由到合适模型"""
COMPLEX_KEYWORDS = ["分析", "推理", "比较", "评估", "设计", "架构"]
SIMPLE_KEYWORDS = ["翻译", "摘要", "格式化", "提取", "分类"]
def classify(self, prompt: str) -> Complexity:
prompt_lower = prompt.lower()
token_estimate = len(prompt) // 2 # 粗估
# 长 prompt 通常更复杂
if token_estimate > 2000:
return Complexity.COMPLEX
complex_count = sum(
1 for kw in self.COMPLEX_KEYWORDS if kw in prompt_lower
)
simple_count = sum(
1 for kw in self.SIMPLE_KEYWORDS if kw in prompt_lower
)
if complex_count >= 2:
return Complexity.COMPLEX
if simple_count >= 1 and complex_count == 0:
return Complexity.SIMPLE
return Complexity.MEDIUM
def route(self, prompt: str) -> ModelConfig:
complexity = self.classify(prompt)
return MODELS[complexity]
def estimate_cost(
self, prompt: str, est_output_tokens: int = 500
) -> dict:
model = self.route(prompt)
input_tokens = len(prompt) // 2
input_cost = (input_tokens / 1000) * model.input_cost_per_1k
output_cost = (est_output_tokens / 1000) * model.output_cost_per_1k
return {
"model": model.name,
"input_tokens": input_tokens,
"output_tokens": est_output_tokens,
"total_cost": round(input_cost + output_cost, 6),
}
语义缓存
from dataclasses import dataclass, field
from datetime import datetime, timedelta
@dataclass
class CacheEntry:
    """One cached (query, embedding, response) triple."""
    query: str                  # original query text, kept for inspection
    query_vector: list[float]   # embedding used for similarity matching
    response: str               # cached LLM response returned on a hit
    # Naive local time; compared against datetime.now() during eviction.
    created_at: datetime = field(default_factory=datetime.now)
    hit_count: int = 0          # number of times this entry served a hit
class SemanticCache:
"""语义级缓存——相似问题命中同一缓存"""
def __init__(
self, similarity_threshold: float = 0.92, ttl_hours: int = 24
):
self.entries: list[CacheEntry] = []
self.threshold = similarity_threshold
self.ttl = timedelta(hours=ttl_hours)
self.stats = {"hits": 0, "misses": 0}
def get(self, query_vector: list[float]) -> str | None:
self._evict_expired()
best_match = None
best_sim = 0.0
for entry in self.entries:
sim = self._cosine_sim(query_vector, entry.query_vector)
if sim > best_sim:
best_sim = sim
best_match = entry
if best_match and best_sim >= self.threshold:
best_match.hit_count += 1
self.stats["hits"] += 1
return best_match.response
self.stats["misses"] += 1
return None
def put(
self, query: str, query_vector: list[float], response: str
) -> None:
self.entries.append(CacheEntry(query, query_vector, response))
@property
def hit_rate(self) -> float:
total = self.stats["hits"] + self.stats["misses"]
return self.stats["hits"] / total if total > 0 else 0.0
def _evict_expired(self) -> None:
now = datetime.now()
self.entries = [
e for e in self.entries if now - e.created_at < self.ttl
]
@staticmethod
def _cosine_sim(a: list[float], b: list[float]) -> float:
import math
dot = sum(x * y for x, y in zip(a, b))
ma = math.sqrt(sum(x**2 for x in a))
mb = math.sqrt(sum(x**2 for x in b))
return dot / (ma * mb) if ma and mb else 0.0
常见优化手段对比
| 优化手段 | 成本降幅 | 延迟影响 | 实现难度 | 质量影响 |
|---|---|---|---|---|
| 模型降级路由 | 40-70% | 降低 | 中 | 轻微 |
| 语义缓存 | 30-60% | 大幅降低 | 中 | 无 |
| Prompt 压缩 | 20-40% | 降低 | 低 | 可能降低 |
| 批量推理 | 30-50% | 增加(吞吐提升) | 低 | 无 |
| 量化 (INT8/INT4) | 50-75% | 降低 | 高 | 轻微 |
| 知识蒸馏 | 60-80% | 降低 | 高 | 中等 |
| KV Cache 优化 | 20-30% | 降低 | 高 | 无 |
月度成本预算示例
| 场景 | 日请求量 | 平均input/output | 模型 | 月成本(美元) |
|---|---|---|---|---|
| 内部工具 | 1,000 | 500/200 tokens | DeepSeek-V3 | ~$3 |
| 客服Bot | 10,000 | 800/300 tokens | GPT-4o-mini | ~$90 |
| 内容生成 | 5,000 | 1000/2000 tokens | Claude-3.5-Sonnet | ~$4,950 |
| 同上+路由优化 | 5,000 | 1000/2000 tokens | 混合路由 | ~$600 |
本章小结
- 模型路由是性价比最高的优化——70% 请求可用小模型处理
- 语义缓存降低重复成本——相似问题直接返回缓存结果
- 量化适合自部署——INT8 几乎无质量损失,成本减半
- 先监控再优化——用日志分析哪些请求消耗最高
- 设置预算告警——避免意外的 API 账单
下一章:开源模型生态