稳定性保障与故障排查
生产系统必须为失败做准备。限流、熔断、降级——构建弹性 LLM 系统。
稳定性体系
graph TB
A[稳定性保障] --> B[预防]
A --> C[应对]
A --> D[恢复]
B --> B1[限流]
B --> B2[超时]
B --> B3[容量规划]
C --> C1[熔断]
C --> C2[降级]
C --> C3[重试]
D --> D1[自动恢复]
D --> D2[灰度回滚]
D --> D3[故障复盘]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style C fill:#fff3e0,stroke:#f57c00,stroke-width:2px
限流与降级
"""
限流与降级策略
"""
import time
import enum
from dataclasses import dataclass
class DegradationLevel(enum.Enum):
    """Degradation levels, ordered from full service (NORMAL) to cache-only (EMERGENCY)."""
    NORMAL = "normal"
    LIGHT = "light"          # light degradation
    MODERATE = "moderate"    # moderate degradation
    HEAVY = "heavy"          # heavy degradation
    EMERGENCY = "emergency"  # emergency degradation


@dataclass
class DegradationPolicy:
    """Serving policy that applies while a given degradation level is active."""
    level: DegradationLevel
    model: str                    # model to route to ("cache_only" = no model call)
    max_tokens: int               # response token budget (0 = no generation)
    features_disabled: list[str]  # features switched off at this level
    cache_only: bool = False      # serve cached answers only


class AdaptiveDegrader:
    """Adaptive degrader: escalates or recovers the service level based on
    the number of errors observed inside a sliding time window.

    Bug fix vs. previous version: expired error timestamps were pruned only
    inside record_error(), so once traffic turned error-free the stale window
    kept the error count high and the service stayed degraded forever. The
    window is now pruned on every observation (error AND success).
    """

    # Static policy table: one serving policy per degradation level.
    POLICIES = {
        DegradationLevel.NORMAL: DegradationPolicy(
            level=DegradationLevel.NORMAL,
            model="gpt-4o",
            max_tokens=4096,
            features_disabled=[],
        ),
        DegradationLevel.LIGHT: DegradationPolicy(
            level=DegradationLevel.LIGHT,
            model="gpt-4o-mini",
            max_tokens=2048,
            features_disabled=["streaming"],
        ),
        DegradationLevel.MODERATE: DegradationPolicy(
            level=DegradationLevel.MODERATE,
            model="gpt-4o-mini",
            max_tokens=512,
            features_disabled=["streaming", "rag"],
        ),
        DegradationLevel.HEAVY: DegradationPolicy(
            level=DegradationLevel.HEAVY,
            model="gpt-4o-mini",
            max_tokens=256,
            features_disabled=["streaming", "rag", "history"],
        ),
        DegradationLevel.EMERGENCY: DegradationPolicy(
            level=DegradationLevel.EMERGENCY,
            model="cache_only",
            max_tokens=0,
            features_disabled=["all"],
            cache_only=True,
        ),
    }

    def __init__(self) -> None:
        self.current_level = DegradationLevel.NORMAL
        self.error_window: list[float] = []  # timestamps of recent errors
        self.window_size = 60  # sliding window length, seconds

    def _prune(self) -> None:
        """Drop error timestamps that have fallen out of the sliding window."""
        cutoff = time.time() - self.window_size
        self.error_window = [t for t in self.error_window if t > cutoff]

    def record_error(self) -> None:
        """Record one failed request and re-evaluate the degradation level."""
        self.error_window.append(time.time())
        self._prune()
        self._auto_adjust()

    def record_success(self) -> None:
        """Record one successful request; may step back toward NORMAL."""
        # Prune first so stale errors cannot block recovery (the bug fix).
        self._prune()
        self._try_recover()

    def _auto_adjust(self) -> None:
        """Escalate the level based on the error count in the window."""
        error_count = len(self.error_window)
        if error_count > 50:
            self._set_level(DegradationLevel.EMERGENCY)
        elif error_count > 30:
            self._set_level(DegradationLevel.HEAVY)
        elif error_count > 15:
            self._set_level(DegradationLevel.MODERATE)
        elif error_count > 5:
            self._set_level(DegradationLevel.LIGHT)

    def _try_recover(self) -> None:
        """Step one level toward NORMAL once the error rate has subsided."""
        error_count = len(self.error_window)
        if error_count < 3 and self.current_level != DegradationLevel.NORMAL:
            levels = list(DegradationLevel)
            current_idx = levels.index(self.current_level)
            if current_idx > 0:
                self._set_level(levels[current_idx - 1])

    def _set_level(self, level: DegradationLevel) -> None:
        """Switch the active level, logging any change."""
        if level != self.current_level:
            print(f" 降级等级变更: {self.current_level.value} → {level.value}")
            self.current_level = level

    def get_policy(self) -> DegradationPolicy:
        """Return the serving policy for the current level."""
        return self.POLICIES[self.current_level]
# Demo: drive the degrader with a burst of errors and watch the policy shift.
degrader = AdaptiveDegrader()
for i in range(20):
    degrader.record_error()
    if i % 5:
        continue  # only report every 5th error
    policy = degrader.get_policy()
    print(f" 错误 #{i+1}: 当前策略 → model={policy.model}, "
          f"max_tokens={policy.max_tokens}")
灰度发布
"""
灰度发布策略
"""
import random
import hashlib
class GrayRelease:
    """Canary / gray-release manager.

    Routes each user to the new or old configuration. Routing is sticky:
    a user's bucket is derived from a hash of the user id, so the same user
    always lands in the same group. White/black lists override the rollout
    percentage.
    """

    def __init__(self) -> None:
        self.rollout_percentage: float = 0  # 0-100, share of traffic on the new version
        self.whitelist: set[str] = set()    # always routed to the new version
        self.blacklist: set[str] = set()    # always routed to the old version
        self.new_version_config: dict = {}
        self.old_version_config: dict = {}

    def configure(
        self,
        rollout_pct: float,
        new_config: dict,
        old_config: dict,
    ) -> None:
        """Set the rollout percentage and the two candidate configurations."""
        self.rollout_percentage = rollout_pct
        self.new_version_config = new_config
        self.old_version_config = old_config
        print(f"灰度配置: {rollout_pct}% 使用新版本")

    def route_user(self, user_id: str) -> dict:
        """Return the configuration this user should receive."""
        # Explicit lists take precedence over the percentage rollout.
        if user_id in self.whitelist:
            return self.new_version_config
        if user_id in self.blacklist:
            return self.old_version_config
        # Consistent hashing: map the user id onto a stable bucket in [0, 100).
        # MD5 is used for bucketing only, not for security.
        digest = hashlib.md5(user_id.encode()).hexdigest()
        bucket = int(digest[:8], 16) % 100
        use_new = bucket < self.rollout_percentage
        return self.new_version_config if use_new else self.old_version_config
# Demo: configure a 10% rollout, then check how 1000 users are distributed.
gray = GrayRelease()
gray.configure(
    rollout_pct=10,
    new_config={"model": "gpt-4o", "prompt_version": "v2.0"},
    old_config={"model": "gpt-4o-mini", "prompt_version": "v1.5"},
)

# Everything that is not v2.0 counts as the old version (same rule as before).
new_count = sum(
    1 for i in range(1000)
    if gray.route_user(f"user_{i}")["prompt_version"] == "v2.0"
)
results = {"new": new_count, "old": 1000 - new_count}
print(f"灰度分布: 新版 {results['new']}, 旧版 {results['old']}")
故障排查流程
graph TB
A[发现异常] --> B{哪类问题?}
B -->|延迟高| C[检查模型 API 延迟]
B -->|错误多| D[检查错误日志]
B -->|质量差| E[检查 Prompt 变更]
C --> C1[API 提供商故障?]
C --> C2[请求量突增?]
C --> C3[Prompt 变长?]
D --> D1[429 限流?]
D --> D2[500 服务端?]
D --> D3[超时?]
E --> E1[Prompt 版本回退]
E --> E2[模型版本变更?]
E --> E3[知识库数据问题?]
C1 -->|是| F[启用 Fallback]
D1 -->|是| G[降低 QPS / 等待]
D2 -->|是| H[检查日志 + 重启]
style A fill:#ffcdd2,stroke:#c62828,stroke-width:2px
style F fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
"""
故障排查工具
"""
class TroubleshootingGuide:
    """Troubleshooting playbook: maps observed symptoms to known issues,
    likely causes, and ordered remediation steps."""

    # Knowledge base: issue name -> symptoms / causes / solutions.
    COMMON_ISSUES = {
        "429 Rate Limit": {
            "symptoms": ["请求被拒绝", "HTTP 429"],
            "causes": ["QPS 超限", "Token/分钟限制", "并发数限制"],
            "solutions": [
                "1. 检查 rate limit headers",
                "2. 降低并发数",
                "3. 启用请求队列",
                "4. 联系供应商提升限额",
            ],
        },
        "高延迟": {
            "symptoms": ["P95 > 3s", "用户投诉慢"],
            "causes": ["模型负载高", "Prompt 过长", "网络问题"],
            "solutions": [
                "1. 启用流式响应",
                "2. 压缩 Prompt",
                "3. 用更小的模型",
                "4. 启用缓存",
            ],
        },
        "输出质量下降": {
            "symptoms": ["用户满意度下降", "评估分数降低"],
            "causes": ["模型版本更新", "Prompt drift", "数据质量"],
            "solutions": [
                "1. 检查模型版本是否变更",
                "2. 对比 Prompt 历史版本",
                "3. 运行评估基准测试",
                "4. 回滚到上一版本",
            ],
        },
        "成本突增": {
            "symptoms": ["日成本翻倍", "Token 消耗异常"],
            "causes": ["流量增长", "Prompt 变长", "缓存失效", "模型升级"],
            "solutions": [
                "1. 检查请求量趋势",
                "2. 检查平均 Token 数",
                "3. 检查缓存命中率",
                "4. 检查模型路由分布",
            ],
        },
    }

    @classmethod
    def diagnose(cls, symptom: str) -> None:
        """Print every known issue whose symptom list matches *symptom*."""
        print(f"\n诊断: {symptom}")
        for issue, info in cls.COMMON_ISSUES.items():
            # Substring match against each recorded symptom marker.
            matched = any(marker in symptom for marker in info["symptoms"])
            if not matched:
                continue
            print(f"\n可能问题: {issue}")
            print(" 可能原因:")
            for cause in info["causes"]:
                print(f" - {cause}")
            print(" 解决方案:")
            for step in info["solutions"]:
                print(f" {step}")
# Demo: run the guide against two observed symptoms.
for symptom in ("P95 > 3s", "HTTP 429"):
    TroubleshootingGuide.diagnose(symptom)
故障复盘模板
POSTMORTEM_TEMPLATE = """
# 故障复盘
## 基本信息
- **故障时间**: YYYY-MM-DD HH:MM - HH:MM
- **影响范围**: X% 用户受影响
- **影响时长**: X 分钟
- **严重等级**: P1/P2/P3
## 时间线
- HH:MM 告警触发
- HH:MM 人员响应
- HH:MM 定位原因
- HH:MM 执行修复
- HH:MM 确认恢复
## 根因分析
(5 个 Why 法)
1. 为什么出现故障?→ ...
2. 为什么没有被预防?→ ...
3. 为什么没有更快发现?→ ...
## 改进措施
| 序号 | 措施 | 负责人 | 完成时间 |
|------|------|--------|----------|
| 1 | 添加告警规则 | - | - |
| 2 | 增加自动熔断 | - | - |
| 3 | 完善文档 | - | - |
"""
print(POSTMORTEM_TEMPLATE)
本章小结
- 限流保护系统不被压垮
- 自适应降级根据错误率自动调整服务质量
- 灰度发布降低变更风险
- 故障排查:先看指标 → 再看日志 → 最后看追踪
- 每次故障都要做复盘,转化为改进措施
下一章:安全合规与 CI/CD 实践。