2 min read · 471 words

LLM 生产案例集

从真实生产事故和优化实践中提炼的经验教训，覆盖常见的 LLM 生产场景。

案例概览

graph TB
    A[生产案例] --> B[智能客服]
    A --> C[代码助手]
    A --> D[内容审核]
    A --> E[数据分析]
    B --> B1["降本 70%<br/>缓存 + 模型分级"]
    C --> C1["延迟 P99 < 3s<br/>流式 + 预热"]
    D --> D1["准确率 99.5%<br/>双模型交叉验证"]
    E --> E1["成本可控<br/>Token 预算管理"]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px

案例一：智能客服降本 70%

背景

日均 50 万次对话，全部使用 GPT-4，月成本 $15,000+
80% 为常见问题（退换货、查订单、改地址）
用户对延迟敏感，P95 需 < 2s

解决方案

"""
分级路由客服系统
"""
from dataclasses import dataclass
from enum import Enum
class Complexity(Enum):
    """Complexity tier assigned to an incoming customer-service query."""

    SIMPLE = "simple"     # FAQ-style question that can be answered directly
    MEDIUM = "medium"     # answering requires looking up context
    COMPLEX = "complex"   # answering requires advanced reasoning
@dataclass
class RoutingResult:
    """Outcome of one routing decision."""

    # Tier the query was classified into.
    complexity: Complexity
    # Name of the model selected for that tier.
    model: str
    # Whether the caller should consult the response cache.
    use_cache: bool
    # CustomerServiceRouter.route fills this with the tier's "cost_per_1k"
    # entry, i.e. a unit price rather than a per-request total.
    estimated_cost: float
class CustomerServiceRouter:
    """Route customer-service queries to a model tier by estimated complexity."""

    # Model and unit price per tier.
    # NOTE(review): cost_per_1k is presumably USD per 1k tokens — confirm.
    MODEL_CONFIG = {
        Complexity.SIMPLE: {"model": "gpt-3.5-turbo", "cost_per_1k": 0.0005},
        Complexity.MEDIUM: {"model": "gpt-4o-mini", "cost_per_1k": 0.00015},
        Complexity.COMPLEX: {"model": "gpt-4o", "cost_per_1k": 0.005},
    }

    # A query containing any of these substrings is treated as an FAQ candidate.
    FAQ_KEYWORDS = {
        "退货", "退换", "退款", "查订单", "改地址", "发票",
        "配送", "物流", "快递", "价格", "优惠",
    }

    # Exclusive history-length cut-offs used by _classify.
    _SIMPLE_HISTORY_LIMIT = 3
    _MEDIUM_HISTORY_LIMIT = 5

    def route(self, query: str, history_len: int = 0) -> RoutingResult:
        """Classify *query* and return the routing decision for it.

        Args:
            query: The user's message text.
            history_len: Length of the conversation history so far.

        Returns:
            A RoutingResult carrying the tier, the tier's model name, a cache
            flag that is True only for the SIMPLE tier, and the tier's
            ``cost_per_1k`` value as ``estimated_cost``.
        """
        complexity = self._classify(query, history_len)
        config = self.MODEL_CONFIG[complexity]
        return RoutingResult(
            complexity=complexity,
            model=config["model"],
            use_cache=(complexity == Complexity.SIMPLE),
            estimated_cost=config["cost_per_1k"],
        )

    def _classify(self, query: str, history_len: int) -> Complexity:
        """Return the complexity tier for *query*.

        SIMPLE when the query contains an FAQ keyword and the history is
        shorter than the simple cut-off; otherwise MEDIUM while the history is
        shorter than the medium cut-off; otherwise COMPLEX.
        """
        has_faq_keyword = any(kw in query for kw in self.FAQ_KEYWORDS)
        if has_faq_keyword and history_len < self._SIMPLE_HISTORY_LIMIT:
            return Complexity.SIMPLE
        if history_len < self._MEDIUM_HISTORY_LIMIT:
            return Complexity.MEDIUM
        return Complexity.COMPLEX

效果

指标	优化前	优化后
月成本	$15,000	$4,500
P95 延迟	3.2s	1.8s
用户满意度	4.1/5	4.3/5
缓存命中率	0%	42%

案例二：代码助手性能优化

背景

IDE 内代码补全，延迟极度敏感
P99 延迟 > 5s 导致用户放弃使用
需支持多语言、多框架

关键优化

"""
代码助手性能优化策略
"""
from dataclasses import dataclass, field
import time
@dataclass
class CompletionConfig:
    """Settings for code-completion requests.

    In the code visible here only ``timeout_ms`` is read (by
    CodeCompletionOptimizer.get_completion, to count slow calls).
    """

    max_context_tokens: int = 2048
    max_completion_tokens: int = 256
    stream: bool = True
    # Threshold in milliseconds above which a call is counted as a timeout.
    timeout_ms: float = 3000

    # Warm-up settings
    prefetch_on_file_open: bool = True
    cache_file_context: bool = True
class CodeCompletionOptimizer:
    """Serve code completions from a trimmed context with an in-memory result cache."""

    # Upper bound on cached completions so the cache cannot grow without limit;
    # the oldest entry is evicted first.
    _MAX_CACHE_ENTRIES = 1024

    def __init__(self, config: CompletionConfig):
        """Store *config* and start with an empty cache and zeroed counters."""
        self.config = config
        self._context_cache: dict[str, str] = {}
        self._stats = {"total": 0, "cache_hits": 0, "timeouts": 0}

    def get_completion(self, file_path: str, cursor_pos: int, file_content: str) -> str:
        """Return a completion for the cursor position, or "" if none was produced.

        Args:
            file_path: Path of the file being edited; part of the cache key.
            cursor_pos: Character offset of the cursor within *file_content*.
            file_content: Full text of the file.

        Returns:
            The cached or freshly generated completion text.
        """
        self._stats["total"] += 1
        # perf_counter is monotonic, so the measurement is immune to wall-clock
        # adjustments that could corrupt a time.time() difference.
        start = time.perf_counter()

        # Build the trimmed context around the cursor.
        context = self._build_context(file_path, cursor_pos, file_content)

        # Only non-empty completions are ever stored, so None means "not cached".
        cache_key = f"{file_path}:{cursor_pos}:{hash(context)}"
        cached = self._context_cache.get(cache_key)
        if cached is not None:
            self._stats["cache_hits"] += 1
            return cached

        # Call the model, then record whether the whole request ran too long.
        result = self._call_with_timeout(context)
        elapsed_ms = (time.perf_counter() - start) * 1000
        if elapsed_ms > self.config.timeout_ms:
            self._stats["timeouts"] += 1

        if result:
            if len(self._context_cache) >= self._MAX_CACHE_ENTRIES:
                # dicts keep insertion order, so the first key is the oldest.
                del self._context_cache[next(iter(self._context_cache))]
            self._context_cache[cache_key] = result
        return result or ""

    def _build_context(self, file_path: str, cursor_pos: int, content: str) -> str:
        """Return only the lines near the cursor.

        The window is the 50 lines before the cursor line, the cursor line
        itself, and the 9 lines after it, clipped to the file boundaries.
        A negative *cursor_pos* is treated as 0 instead of slicing from the end.
        """
        lines = content.split("\n")
        line_num = content[:max(cursor_pos, 0)].count("\n")
        start = max(0, line_num - 50)
        end = min(len(lines), line_num + 10)
        return "\n".join(lines[start:end])

    def _call_with_timeout(self, context: str) -> str | None:
        """Model call with timeout (illustrative stub returning a placeholder)."""
        return f"# completion for context of length {len(context)}"

    def get_stats(self) -> dict:
        """Return the raw counters plus cache-hit and timeout rates (0 when idle)."""
        total = self._stats["total"]
        return {
            **self._stats,
            "cache_hit_rate": (
                round(self._stats["cache_hits"] / total, 3) if total else 0
            ),
            "timeout_rate": (
                round(self._stats["timeouts"] / total, 3) if total else 0
            ),
        }

案例三：内容审核双模型策略

架构

graph LR
    A[用户内容] --> B["快速模型<br/>GPT-3.5"]
    B --> C{风险评分}
    C -->|"低风险<90%"| D[直接放行]
    C -->|中风险| E["高级模型<br/>GPT-4o"]
    C -->|"高风险>95%"| F[直接拦截]
    E --> G{二次判定}
    G -->|安全| D
    G -->|违规| F
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style F fill:#ffebee,stroke:#c62828,stroke-width:2px

效果

指标	单模型	双模型
准确率	97.2%	99.5%
成本	$8,000/月	$3,200/月
平均延迟	1.5s	0.8s（90% 快速通过）
误拦率	2.1%	0.3%

案例四：Token 预算管理

"""
Token 预算管理器
"""
from dataclasses import dataclass, field
import time
@dataclass
class Budget:
    """Token allowance for one window together with the usage counted against it."""

    # Maximum number of tokens allowed per window.
    daily_limit: int
    # Tokens consumed so far in the current window.
    used: int = 0
    # time.time() value at which the window ends; defaults to 86400 seconds
    # (24 hours) after the instance is created.
    reset_time: float = field(default_factory=lambda: time.time() + 86400)
class TokenBudgetManager:
    """Track per-project token budgets in memory.

    Projects without a configured budget are unrestricted. An expired window
    is rolled over lazily the next time the project is checked, charged, or
    reported on.
    """

    # Length of one budget window in seconds (24 hours).
    _WINDOW_SECONDS = 86400

    def __init__(self):
        self._budgets: dict[str, Budget] = {}

    def set_budget(self, project: str, daily_limit: int):
        """Create or replace the budget for *project* with a fresh window."""
        self._budgets[project] = Budget(daily_limit=daily_limit)

    def _current_budget(self, project: str) -> "Budget | None":
        """Return the project's budget after rolling over an expired window, or None."""
        budget = self._budgets.get(project)
        if budget is not None and time.time() > budget.reset_time:
            budget.used = 0
            budget.reset_time = time.time() + self._WINDOW_SECONDS
        return budget

    def check_budget(self, project: str, estimated_tokens: int) -> bool:
        """Return True if *estimated_tokens* fits within the remaining budget.

        A project with no budget set always passes.
        """
        budget = self._current_budget(project)
        if budget is None:
            return True
        return (budget.used + estimated_tokens) <= budget.daily_limit

    def consume(self, project: str, tokens: int):
        """Charge *tokens* to the project's current window; no-op if no budget is set."""
        # Roll over first so usage is never added to an already expired window.
        budget = self._current_budget(project)
        if budget is not None:
            budget.used += tokens

    def get_usage(self, project: str) -> dict:
        """Return limit, usage, remaining tokens and usage percentage.

        Returns ``{"error": "no budget set"}`` for an unknown project. A zero
        ``daily_limit`` reports ``usage_pct`` 0.0 instead of raising
        ZeroDivisionError.
        """
        budget = self._current_budget(project)
        if budget is None:
            return {"error": "no budget set"}
        limit = budget.daily_limit
        usage_pct = round(budget.used / limit * 100, 1) if limit else 0.0
        return {
            "daily_limit": limit,
            "used": budget.used,
            "remaining": limit - budget.used,
            "usage_pct": usage_pct,
        }

生产经验总结

经验	说明
分级路由	80% 请求可用便宜模型，节省 60-70% 成本
语义缓存	相似问题缓存命中可达 30-50%
流式响应	TTFT 比完整延迟更影响体验
双模型验证	安全场景用快+慢双模型，降成本提准确率
Token 预算	按项目/用户设上限，防止意外支出
降级策略	主模型不可用时自动切备用模型

本章小结

生产环境没有银弹——每个场景需分析流量特征、延迟要求和成本约束，选择适合的组合策略。关键原则：先用便宜的模型兜底，复杂场景才上大模型。

下一章：多模态模型部署