从 Demo 到生产
High Contrast
Dark Mode
Light Mode
Sepia
Forest
1 min read · 213 words

从 Demo 到生产

97% 的 LLM Demo 都无法直接上线。Demo 和生产系统之间有一条巨大的鸿沟。

The Demo Trap

```mermaid
graph LR
    A[Demo] -->|"看起来很容易"| B[生产]
    B --> C[延迟问题]
    B --> D[成本爆炸]
    B --> E[稳定性差]
    B --> F[安全风险]
    B --> G[监控缺失]
    style A fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
    style B fill:#ffcdd2,stroke:#c62828,stroke-width:2px
```
| 维度 | Demo | 生产系统 |
| --- | --- | --- |
| 流量 | 1-10 QPS | 100-10000 QPS |
| 延迟 | 无所谓 | P99 < 2s |
| 可用性 | 偶尔挂没事 | 99.9% SLA |
| 成本 | 刷卡就行 | 月预算 $10K 封顶 |
| 安全 | 没想过 | 必须考虑注入、数据泄露 |
| 监控 | print 日志 | 全链路追踪 |

生产环境的核心挑战

"""
生产环境 vs Demo 的差异
"""
from dataclasses import dataclass
@dataclass
class ProductionRequirements:
    """Non-functional requirements for a production deployment.

    Every field has a default, so ``ProductionRequirements()`` yields the
    default target set and individual targets can be overridden by keyword.
    """

    # Performance
    latency_p50_ms: int = 500
    latency_p99_ms: int = 2000
    throughput_qps: int = 100

    # Availability
    sla_percentage: float = 99.9
    # Downtime allowance matching the default 99.9% SLA. This is a plain
    # default, not derived from ``sla_percentage``; change both together.
    max_downtime_per_month_minutes: float = 43.8

    # Cost
    monthly_budget_usd: float = 10000
    cost_per_request_max_usd: float = 0.01

    # Security
    require_encryption: bool = True
    require_auth: bool = True
    require_audit_log: bool = True
    data_retention_days: int = 90

    def print_requirements(self) -> None:
        """Print the latency, throughput, SLA and cost targets to stdout.

        The downtime allowance, the security flags and the retention period
        are not part of the printed summary.
        """
        print("=== 生产环境需求清单 ===")
        print(f"  延迟 P50: {self.latency_p50_ms}ms")
        print(f"  延迟 P99: {self.latency_p99_ms}ms")
        print(f"  吞吐量: {self.throughput_qps} QPS")
        print(f"  SLA: {self.sla_percentage}%")
        # ``:,.0f`` renders the budget with thousands separators, no decimals.
        print(f"  月预算: ${self.monthly_budget_usd:,.0f}")
        print(f"  单次成本上限: ${self.cost_per_request_max_usd}")
# Define the requirements: build an instance with every field left at its
# default value, then print the summary to stdout.
prod_req = ProductionRequirements()
prod_req.print_requirements()

技术栈选型

"""
生产级 LLM 技术栈
"""
# Recommended stack per component. Each entry carries the recommended choice
# ("推荐"), the reason for it ("原因") and a list of alternatives ("替代").
PRODUCTION_STACK = {
    "API 框架": {
        "推荐": "FastAPI",
        "原因": "异步、高性能、OpenAPI 文档",
        "替代": ["Flask + Gunicorn", "Django Ninja"],
    },
    "任务队列": {
        "推荐": "Celery + Redis",
        "原因": "异步处理长任务、重试机制",
        "替代": ["RQ", "Dramatiq"],
    },
    "缓存": {
        "推荐": "Redis",
        "原因": "语义缓存、会话存储、限流",
        "替代": ["Memcached", "本地 LRU"],
    },
    "数据库": {
        "推荐": "PostgreSQL + pgvector",
        "原因": "向量搜索 + 关系数据一体化",
        "替代": ["MongoDB", "单独的向量数据库"],
    },
    "监控": {
        "推荐": "Prometheus + Grafana",
        "原因": "指标收集 + 可视化 + 告警",
        "替代": ["Datadog", "New Relic"],
    },
    "日志": {
        "推荐": "结构化日志 + ELK",
        "原因": "可搜索、可分析",
        "替代": ["Loki + Grafana", "CloudWatch"],
    },
    "部署": {
        "推荐": "Docker + Kubernetes",
        "原因": "弹性伸缩、滚动更新",
        "替代": ["Docker Compose", "Cloud Run"],
    },
}

# Print the recommendation and its rationale for each component, in insertion
# order. The "替代" alternatives stay in the table but are not printed.
for component, info in PRODUCTION_STACK.items():
    print(f"\n{component}:")
    print(f"  推荐: {info['推荐']}")
    print(f"  原因: {info['原因']}")

生产就绪清单

"""
Production Readiness Checklist
"""
class ReadinessChecker:
    """Score an implementation against a fixed production-readiness checklist."""

    # Checklist items grouped by layer; every item weighs the same in the score.
    CHECKLIST = {
        "API 层": [
            "认证与鉴权",
            "请求频率限制",
            "输入验证与清洗",
            "错误处理(不暴露内部错误)",
            "API 版本管理",
            "OpenAPI 文档",
        ],
        "LLM 层": [
            "Prompt 版本管理",
            "多模型 Fallback",
            "流式响应",
            "Token 限制",
            "输出过滤",
        ],
        "数据层": [
            "数据加密(传输+存储)",
            "数据备份策略",
            "PII 脱敏",
            "审计日志",
        ],
        "运维层": [
            "健康检查端点",
            "指标监控",
            "告警规则",
            "日志聚合",
            "自动扩缩容",
        ],
        "安全层": [
            "Prompt 注入防护",
            "输出安全过滤",
            "DDoS 防护",
            "密钥管理",
        ],
    }

    def check(self, implemented: dict[str, list[str]]) -> None:
        """Print a per-category and overall readiness report to stdout.

        Args:
            implemented: Maps a category name from ``CHECKLIST`` to the items
                already completed. Categories missing from the mapping count
                as nothing done; categories or items that do not appear in
                ``CHECKLIST`` are ignored.

        The final verdict is green at >= 90% of all items, yellow at >= 70%,
        and red otherwise. An empty ``CHECKLIST`` is reported as 0% instead
        of raising ``ZeroDivisionError``.
        """
        total = 0
        passed = 0
        for category, items in self.CHECKLIST.items():
            # Build the lookup set once per category so each item test is O(1).
            done = set(implemented.get(category, []))
            cat_total = len(items)
            cat_done = sum(item in done for item in items)
            total += cat_total
            passed += cat_done

            status = "✅" if cat_done == cat_total else "⚠️"
            print(f"\n{status} {category} ({cat_done}/{cat_total})")
            for item in items:
                # Named ``mark`` so the local does not shadow this method.
                mark = "✓" if item in done else "✗"
                print(f"  [{mark}] {item}")

        # Guard the division: a subclass may define an empty checklist.
        pct = passed / total * 100 if total else 0.0
        print(f"\n总体就绪度: {passed}/{total} ({pct:.0f}%)")
        if pct >= 90:
            print("🟢 可以上线")
        elif pct >= 70:
            print("🟡 需要补齐关键项")
        else:
            print("🔴 尚未就绪")
# Example run: only some items in three of the checklist categories are
# reported as implemented; the remaining categories are left out entirely.
checker = ReadinessChecker()
checker.check(
    implemented={
        "API 层": ["认证与鉴权", "请求频率限制", "输入验证与清洗"],
        "LLM 层": ["Prompt 版本管理", "流式响应"],
        "运维层": ["健康检查端点"],
    }
)

本章小结

下一章:生产级系统架构设计。