Agent 运维与持续迭代
Agent 上线只是开始。持续的运维、评估和迭代才能让 Agent 系统保持高质量。本章涵盖 Agent 的生命周期管理。
Agent 运维全景
graph TB
A[Agent 运维] --> B[版本管理]
A --> C[A/B 测试]
A --> D[持续评估]
A --> E[故障响应]
A --> F[迭代优化]
B --> B1[Prompt 版本化]
B --> B2[工具版本兼容]
C --> C1[流量分流]
C --> C2[效果对比]
D --> D1[自动化测试集]
D --> D2[用户反馈收集]
E --> E1[降级策略]
E --> E2[回滚机制]
F --> F1[数据驱动改进]
F --> F2[Prompt 调优]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style D fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
Prompt 版本管理
"""
Agent Prompt 版本管理
"""
import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class PromptVersion:
    """A single stored revision of an agent prompt.

    Attributes are plain data; the only derived value is ``content_hash``,
    a short fingerprint of the prompt text.
    """

    # Identifier of this revision within its agent's history.
    version_id: str
    # The prompt text itself.
    content: str
    # Free-form note describing the revision.
    description: str
    # Timestamp supplied by whoever creates the record.
    created_at: datetime
    # Arbitrary metrics attached to this revision; a fresh dict per instance.
    metrics: dict = field(default_factory=dict)
    # Whether this revision is currently flagged as the active one.
    is_active: bool = False

    @property
    def content_hash(self) -> str:
        """Return the first 12 hex characters of the SHA-256 of ``content``."""
        encoded = self.content.encode()
        full_digest = hashlib.sha256(encoded).hexdigest()
        return full_digest[:12]
class PromptRegistry:
    """In-memory registry of prompt versions, grouped by agent name.

    ``versions`` keeps every registered version per agent in registration
    order; ``_active`` remembers which single version is active per agent.
    """

    def __init__(self):
        self.versions: dict[str, list[PromptVersion]] = {}  # agent_name -> versions
        self._active: dict[str, PromptVersion] = {}

    def register(self, agent_name: str, content: str, description: str) -> PromptVersion:
        """Append a new version for ``agent_name`` and return it.

        The version id is ``"v<N>"`` where N is the 1-based position of the
        new entry in the agent's history. The new version is not activated.
        """
        history = self.versions.setdefault(agent_name, [])
        created = PromptVersion(
            version_id=f"v{len(history) + 1}",
            content=content,
            description=description,
            created_at=datetime.now(),
        )
        history.append(created)
        return created

    def activate(self, agent_name: str, version_id: str) -> bool:
        """Make ``version_id`` the active version for ``agent_name``.

        Returns True on success, False when no such version is registered
        (in which case the current active version is left untouched).
        """
        candidates = self.versions.get(agent_name, [])
        target = next((v for v in candidates if v.version_id == version_id), None)
        if target is None:
            return False

        # Clear the flag on whatever was active before switching over.
        if agent_name in self._active:
            self._active[agent_name].is_active = False
        target.is_active = True
        self._active[agent_name] = target
        return True

    def get_active(self, agent_name: str) -> PromptVersion | None:
        """Return the active version for ``agent_name``, or None if there is none."""
        return self._active.get(agent_name)

    def rollback(self, agent_name: str) -> bool:
        """Activate the version registered immediately before the active one.

        Returns False when the agent has fewer than two versions, has no
        active version, or the active version is already the first entry.
        """
        history = self.versions.get(agent_name, [])
        if len(history) < 2:
            return False

        active = self._active.get(agent_name)
        if active is None:
            return False

        for position, candidate in enumerate(history):
            if candidate.version_id == active.version_id:
                break
        else:
            # Active version is not part of the recorded history.
            return False

        if position == 0:
            return False
        return self.activate(agent_name, history[position - 1].version_id)
A/B 测试框架
"""
Agent A/B 测试
"""
import random
from dataclasses import dataclass, field
from collections import defaultdict
@dataclass
class Variant:
    """One arm of an experiment."""

    # Name used to identify this arm.
    name: str
    # Prompt version this arm runs with
    # (presumably a PromptVersion.version_id - TODO confirm against callers).
    prompt_version: str
    # Share of traffic routed to this arm.
    weight: float = 0.5
@dataclass
class ExperimentResult:
"""单次实验结果"""
variant: str
success: bool
latency_ms: float
user_satisfaction: float | None = None
class ABTestFramework:
    """Weighted A/B experiment: sticky assignment, result log, and scoring.

    Results are grouped by variant name in ``results``. Assignment is a pure
    function of the experiment name and the user id, so it is stable across
    processes and restarts.
    """

    def __init__(self, experiment_name: str, variants: list[Variant]):
        """Create an experiment.

        Args:
            experiment_name: Name of the experiment; also salts the
                assignment hash so different experiments bucket users
                independently.
            variants: Arms of the experiment; their weights must sum to 1.0
                (within a tolerance of 0.01).

        Raises:
            ValueError: If the weights do not sum to 1.0.
        """
        self.name = experiment_name
        self.variants = variants
        self.results: dict[str, list[ExperimentResult]] = defaultdict(list)

        total_weight = sum(v.weight for v in variants)
        if abs(total_weight - 1.0) > 0.01:
            raise ValueError(f"权重之和应为 1.0,实际为 {total_weight}")

    def assign_variant(self, user_id: str) -> Variant:
        """Return the variant for ``user_id``; the same user always gets the same one.

        The bucket is derived from a SHA-256 digest rather than the built-in
        ``hash()``: string hashes are salted per interpreter process, which
        would reshuffle users on every restart and across workers.
        """
        import hashlib

        key = f"{self.name}:{user_id}".encode("utf-8")
        digest = hashlib.sha256(key).digest()
        # Map the first 8 digest bytes onto [0, 1).
        bucket = int.from_bytes(digest[:8], "big") / 2**64

        cumulative = 0.0
        for variant in self.variants:
            cumulative += variant.weight
            if bucket < cumulative:
                return variant
        # Weights may sum to slightly less than 1.0; the tail goes to the last arm.
        return self.variants[-1]

    def record(self, result: ExperimentResult):
        """Append ``result`` to the log of the variant it names."""
        self.results[result.variant].append(result)

    def analyze(self) -> dict:
        """Summarise recorded results per variant.

        Returns:
            Mapping of variant name to a dict with ``sample_size``,
            ``success_rate``, ``avg_latency_ms`` and ``avg_satisfaction``
            (None when no result carried a satisfaction score). Variants
            with no recorded results are omitted.
        """
        analysis = {}
        for variant_name, results in self.results.items():
            if not results:
                continue
            successes = sum(1 for r in results if r.success)
            latencies = [r.latency_ms for r in results]
            satisfactions = [
                r.user_satisfaction for r in results if r.user_satisfaction is not None
            ]
            analysis[variant_name] = {
                "sample_size": len(results),
                "success_rate": successes / len(results),
                "avg_latency_ms": sum(latencies) / len(latencies),
                "avg_satisfaction": (
                    sum(satisfactions) / len(satisfactions) if satisfactions else None
                ),
            }
        return analysis

    def get_winner(self) -> str | None:
        """Return the name of the best-scoring variant, or None.

        Score = success_rate * 0.5 + (1 - normalised latency) * 0.2
        + avg_satisfaction * 0.3. Latency is normalised by the slowest
        variant's average latency, so the slowest arm earns no latency
        credit. Ties go to the variant that appears first in the analysis.

        Returns None when fewer than two variants have recorded results.
        """
        analysis = self.analyze()
        if len(analysis) < 2:
            return None

        slowest = max(m["avg_latency_ms"] for m in analysis.values())

        best_variant = None
        best_score = -1.0
        for name, metrics in analysis.items():
            score = metrics["success_rate"] * 0.5
            # Skip the latency term when every arm reports zero latency.
            if slowest > 0:
                score += (1 - metrics["avg_latency_ms"] / slowest) * 0.2
            # NOTE(review): satisfaction is added as-is, which assumes a 0..1
            # scale; confirm the scale used by callers.
            if metrics["avg_satisfaction"] is not None:
                score += metrics["avg_satisfaction"] * 0.3
            if score > best_score:
                best_score = score
                best_variant = name
        return best_variant
故障降级策略
"""
Agent 故障降级
"""
from dataclasses import dataclass
from enum import Enum
class DegradationLevel(Enum):
    """Service modes, from fully operational to fully manual."""

    # Normal operation.
    NORMAL = "normal"
    # Serve from cache only.
    CACHE_ONLY = "cache_only"
    # Fall back to the rule engine.
    RULE_BASED = "rule_based"
    # Hand everything over to human operators.
    HUMAN_ONLY = "human_only"
@dataclass
class HealthStatus:
    """Snapshot of the health signals used to pick a degradation level."""

    # Whether the LLM backend can be reached.
    llm_available: bool = True
    # Whether tools can be reached.
    tool_available: bool = True
    # Fraction of failing requests (compared against a 0..1 threshold).
    error_rate: float = 0.0
    # 95th-percentile latency in milliseconds.
    latency_p95_ms: float = 0.0
class DegradationController:
    """Chooses a degradation level from health signals and serves fallbacks."""

    # An error rate above 30% triggers degradation.
    ERROR_RATE_THRESHOLD = 0.3
    # A P95 latency above 10 s triggers degradation.
    LATENCY_THRESHOLD_MS = 10000

    def __init__(self):
        self.current_level = DegradationLevel.NORMAL
        # Canned responses for the rule-based fallback, keyed by intent.
        self.rules: dict[str, str] = {}

    def evaluate(self, health: HealthStatus) -> DegradationLevel:
        """Recompute, store and return the level implied by ``health``.

        LLM outage takes precedence and selects RULE_BASED; otherwise an
        error rate or P95 latency above its threshold selects CACHE_ONLY;
        otherwise NORMAL.

        NOTE(review): HUMAN_ONLY is never selected here and
        ``health.tool_available`` is not consulted - confirm that is intended.
        """
        if not health.llm_available:
            level = DegradationLevel.RULE_BASED
        elif (
            health.error_rate > self.ERROR_RATE_THRESHOLD
            or health.latency_p95_ms > self.LATENCY_THRESHOLD_MS
        ):
            level = DegradationLevel.CACHE_ONLY
        else:
            level = DegradationLevel.NORMAL

        self.current_level = level
        return level

    def add_fallback_rule(self, intent: str, response: str):
        """Register (or overwrite) the canned response for ``intent``."""
        self.rules[intent] = response

    def handle_degraded(self, intent: str) -> str:
        """Return the fallback reply for ``intent`` at the current level.

        RULE_BASED looks up the canned response (with a maintenance message
        as default); HUMAN_ONLY returns a hand-off message; any other level
        returns an empty string.
        """
        level = self.current_level
        if level == DegradationLevel.RULE_BASED:
            return self.rules.get(intent, "系统正在维护,请稍后再试。")
        if level == DegradationLevel.HUMAN_ONLY:
            return "系统暂时无法自动处理,正在为您转接人工客服。"
        return ""
运维检查清单
| 维度 | 检查项 | 频率 |
|---|---|---|
| 质量 | 自动测试集通过率 | 每次部署 |
| 质量 | 人工抽检满意度 | 每周 |
| 性能 | P95 延迟 < 3s | 持续监控 |
| 成本 | Token 用量趋势 | 每日 |
| 安全 | 输出安全扫描 | 每次部署 |
| 合规 | 数据保留和脱敏 | 每月审计 |
| 可用 | 服务可用率 > 99.5% | 持续监控 |
| 反馈 | 用户反馈分析 | 每周 |
本章小结
| 主题 | 要点 |
|---|---|
| Prompt 版本化 | 注册 → 激活 → 回滚,像代码一样管理 |
| A/B 测试 | 基于用户 ID 的哈希分桶分流(同一用户始终命中同一变体),多维度评分判定优胜 |
| 故障降级 | 4 级降级:正常 → 缓存 → 规则 → 人工 |
| 运维节奏 | 每次部署测试 + 每周抽检 + 每月审计 |
下一章:Agentic RAG