Agent 可观测性
Agent 系统在生产环境中遇到的核心挑战不是"它能不能工作",而是"它为什么这样工作"。可观测性让你理解 Agent 的决策链路、性能瓶颈和故障根因。
可观测性三大支柱
graph TB
A[Agent 可观测性] --> B[日志 Logging]
A --> C[追踪 Tracing]
A --> D[指标 Metrics]
B --> B1[结构化日志]
B --> B2[决策日志]
B --> B3[工具调用日志]
C --> C1[请求链路追踪]
C --> C2[Agent 步骤追踪]
C --> C3[LLM 调用追踪]
D --> D1[延迟分布]
D --> D2[Token 用量]
D --> D3[成功率/错误率]
style A fill:#e8eaf6,stroke:#3f51b5,stroke-width:3px
style C fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style D fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
追踪系统
"""
Agent 执行追踪系统
"""
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
class SpanType(Enum):
    """Kinds of work a trace span can represent.

    The string values are the serialized form of each kind.
    """

    # Root span of a trace (one whole agent run).
    AGENT_RUN = "agent_run"
    # A single call to a language model.
    LLM_CALL = "llm_call"
    # A single tool invocation.
    TOOL_CALL = "tool_call"
    # Retrieval step.
    RETRIEVAL = "retrieval"
    # Planning step.
    PLANNING = "planning"
@dataclass
class Span:
    """One timed unit of work inside a trace.

    Spans form a tree through ``parent_id``; a span with no parent has
    ``parent_id`` set to ``None``.
    """

    span_id: str
    trace_id: str
    parent_id: str | None
    name: str
    span_type: SpanType
    # Timestamps are in seconds (duration_ms scales their difference by 1000).
    start_time: float
    # Remains None until the span has been finished.
    end_time: float | None = None
    # Free-form key/value data attached to the span.
    attributes: dict = field(default_factory=dict)
    # Point-in-time events recorded while the span was open.
    events: list[dict] = field(default_factory=list)
    status: str = "ok"

    @property
    def duration_ms(self) -> float | None:
        """Return the elapsed time in milliseconds, or None while still open."""
        if self.end_time is not None:
            return (self.end_time - self.start_time) * 1000
        return None
class AgentTracer:
    """Record nested spans describing an agent's execution.

    Every span ever started is kept in ``spans`` in start order. A stack of
    currently open spans supplies the parent/child links: a new span's parent
    is whichever span is on top of the stack when it starts, and ``end_span``
    always closes the most recently opened span.
    """

    def __init__(self):
        self.spans: list[Span] = []
        self._active_trace: str | None = None
        self._span_stack: list[Span] = []

    def start_trace(self, name: str) -> str:
        """Begin a new trace and open its root ``AGENT_RUN`` span.

        Spans left open by an earlier trace are not closed here, so call
        ``end_trace`` first if a previous trace may still be running.

        Args:
            name: Name given to the root span.

        Returns:
            The new trace id (16 hexadecimal characters).
        """
        trace_id = uuid.uuid4().hex[:16]
        self._active_trace = trace_id
        self._start_span(name, SpanType.AGENT_RUN, trace_id)
        return trace_id

    def start_llm_span(self, model: str, prompt_tokens: int = 0) -> Span:
        """Open an ``LLM_CALL`` span named ``llm:<model>``.

        The model name and prompt token count are stored in the span's
        attributes. Returns the newly opened span.
        """
        span = self._start_span(f"llm:{model}", SpanType.LLM_CALL)
        span.attributes["model"] = model
        span.attributes["prompt_tokens"] = prompt_tokens
        return span

    def start_tool_span(self, tool_name: str, params: dict | None = None) -> Span:
        """Open a ``TOOL_CALL`` span named ``tool:<tool_name>``.

        Non-empty ``params`` are stored as their string form, truncated to
        200 characters. Returns the newly opened span.
        """
        span = self._start_span(f"tool:{tool_name}", SpanType.TOOL_CALL)
        span.attributes["tool"] = tool_name
        if params:
            span.attributes["params"] = str(params)[:200]
        return span

    def end_span(self, **attributes) -> Span | None:
        """Close the most recently opened span.

        Args:
            **attributes: Extra attributes merged into the span as it closes.

        Returns:
            The closed span, or ``None`` when no span is open.
        """
        if not self._span_stack:
            return None
        span = self._span_stack.pop()
        span.end_time = time.time()
        span.attributes.update(attributes)
        return span

    def add_event(self, name: str, attributes: dict | None = None):
        """Attach a timestamped event to the innermost open span.

        Does nothing when no span is open.
        """
        if self._span_stack:
            self._span_stack[-1].events.append({
                "name": name,
                "timestamp": time.time(),
                "attributes": attributes or {},
            })

    def end_trace(self) -> list[Span]:
        """Close every open span and clear the active trace.

        Returns:
            The tracer's ``spans`` list itself (not a copy). It holds all
            spans recorded by this tracer, including those of earlier traces.
        """
        while self._span_stack:
            self.end_span()
        self._active_trace = None
        return self.spans

    def get_trace_summary(self, trace_id: str) -> dict:
        """Summarise where the time of one trace was spent.

        ``total_duration_ms`` is computed from the trace's top-level spans
        only. A parent span's duration already contains the time of its
        children, so adding up every span would count nested time more than
        once and understate ``llm_ratio``. A span that is still open
        contributes the time of its finished descendants instead.

        Args:
            trace_id: Id of the trace to summarise.

        Returns:
            A dict with ``trace_id``, ``total_spans``, ``total_duration_ms``,
            ``llm_duration_ms``, ``tool_duration_ms`` (all rounded to two
            decimals) and ``llm_ratio`` (a percentage string, or ``"N/A"``
            when the total duration is zero).
        """
        trace_spans = [s for s in self.spans if s.trace_id == trace_id]

        # Group spans by parent. A span whose parent is missing from this
        # trace (no parent, or a parent from another trace) is top-level.
        span_ids = {s.span_id for s in trace_spans}
        children: dict = {}
        for span in trace_spans:
            parent = span.parent_id if span.parent_id in span_ids else None
            children.setdefault(parent, []).append(span)

        total_duration = sum(
            self._elapsed_ms(root, children) for root in children.get(None, [])
        )
        llm_duration = sum(
            s.duration_ms or 0
            for s in trace_spans
            if s.span_type == SpanType.LLM_CALL
        )
        tool_duration = sum(
            s.duration_ms or 0
            for s in trace_spans
            if s.span_type == SpanType.TOOL_CALL
        )
        return {
            "trace_id": trace_id,
            "total_spans": len(trace_spans),
            "total_duration_ms": round(total_duration, 2),
            "llm_duration_ms": round(llm_duration, 2),
            "tool_duration_ms": round(tool_duration, 2),
            "llm_ratio": f"{llm_duration / total_duration:.0%}" if total_duration > 0 else "N/A",
        }

    @staticmethod
    def _elapsed_ms(span: Span, children: dict) -> float:
        """Return a span's duration, or its finished descendants' time if open."""
        duration = span.duration_ms
        if duration is not None:
            return duration
        return sum(
            AgentTracer._elapsed_ms(child, children)
            for child in children.get(span.span_id, [])
        )

    def _start_span(
        self, name: str, span_type: SpanType, trace_id: str | None = None
    ) -> Span:
        """Create a span, record it, and push it onto the open-span stack.

        The parent is the innermost open span, if any. The trace id falls
        back to the active trace, then to an empty string.
        """
        parent_id = self._span_stack[-1].span_id if self._span_stack else None
        span = Span(
            span_id=uuid.uuid4().hex[:16],
            trace_id=trace_id or self._active_trace or "",
            parent_id=parent_id,
            name=name,
            span_type=span_type,
            start_time=time.time(),
        )
        self.spans.append(span)
        self._span_stack.append(span)
        return span
指标收集器
"""
Agent 运行指标收集
"""
import time
from dataclasses import dataclass, field
from collections import defaultdict
@dataclass
class MetricsBucket:
    """Running statistics for one series of numeric samples."""

    count: int = 0
    total_value: float = 0.0
    # min/max start at +inf/-inf so the first recorded sample replaces them.
    min_value: float = float("inf")
    max_value: float = float("-inf")
    # Every recorded sample, in arrival order; percentile() sorts a copy.
    values: list[float] = field(default_factory=list)

    def record(self, value: float):
        """Add one sample and update count, sum, minimum and maximum."""
        self.count += 1
        self.total_value += value
        self.min_value = min(self.min_value, value)
        self.max_value = max(self.max_value, value)
        self.values.append(value)

    @property
    def avg(self) -> float:
        """Return the mean of the recorded samples, or 0.0 when empty."""
        return self.total_value / self.count if self.count > 0 else 0.0

    def percentile(self, p: float) -> float:
        """Return the sample at percentile ``p`` (0-100), or 0.0 when empty.

        The sample is read at index ``int(n * p / 100)`` of the sorted
        values. The index is clamped to the valid range, so ``p >= 100``
        yields the maximum and ``p <= 0`` yields the minimum.
        """
        if not self.values:
            return 0.0
        sorted_vals = sorted(self.values)
        last = len(sorted_vals) - 1
        idx = int(len(sorted_vals) * p / 100)
        # Clamp the lower bound too: a negative index would otherwise read
        # from the end of the list or raise IndexError.
        return sorted_vals[max(0, min(idx, last))]
class AgentMetrics:
    """Aggregate latency, token usage and outcome counts for agent runs."""

    def __init__(self):
        # Buckets are created on first use, keyed by operation / model name.
        self.latency = defaultdict(MetricsBucket)  # latency samples (ms)
        self.token_usage = defaultdict(MetricsBucket)  # token counts
        self.error_counts: dict[str, int] = defaultdict(int)
        self.success_count = 0
        self.total_count = 0

    def record_latency(self, operation: str, duration_ms: float):
        """Record one latency sample, in milliseconds, for ``operation``."""
        bucket = self.latency[operation]
        bucket.record(duration_ms)

    def record_tokens(self, model: str, tokens: int):
        """Record the token count of one call to ``model``."""
        bucket = self.token_usage[model]
        bucket.record(tokens)

    def record_success(self):
        """Count one successful request."""
        self.total_count += 1
        self.success_count += 1

    def record_error(self, error_type: str):
        """Count one failed request under ``error_type``."""
        self.total_count += 1
        self.error_counts[error_type] += 1

    def report(self) -> dict:
        """Build a snapshot of all collected metrics.

        Returns:
            A dict with the success rate (percentage string, or ``"N/A"``
            before any request is counted), the total request count,
            per-operation latency statistics, per-model token statistics,
            and error counts by type.
        """
        if self.total_count > 0:
            success_rate = f"{self.success_count / self.total_count:.1%}"
        else:
            success_rate = "N/A"

        latency_stats = {}
        for op, bucket in self.latency.items():
            latency_stats[op] = {
                "avg_ms": round(bucket.avg, 1),
                "p95_ms": round(bucket.percentile(95), 1),
                "p99_ms": round(bucket.percentile(99), 1),
            }

        token_stats = {}
        for model, bucket in self.token_usage.items():
            token_stats[model] = {
                "total": int(bucket.total_value),
                "avg_per_call": round(bucket.avg, 0),
            }

        return {
            "success_rate": success_rate,
            "total_requests": self.total_count,
            "latency": latency_stats,
            "tokens": token_stats,
            "errors": dict(self.error_counts),
        }
观测工具对比
| 工具 | 类型 | Agent 支持 | 特点 | 价格 |
|---|---|---|---|---|
| LangFuse | 开源 | LangChain/通用 | 追踪+评估+Prompt 管理 | 免费/付费 |
| LangSmith | SaaS | LangChain | 深度集成 LangChain 生态 | 付费 |
| Phoenix | 开源 | 通用 | OpenTelemetry 兼容 | 免费 |
| Helicone | SaaS | API 代理 | 零代码接入 | 免费/付费 |
| OpenLLMetry | 开源 | 通用 | OpenTelemetry 标准 | 免费 |
可观测性设计原则
graph LR
A[设计原则] --> B[结构化日志]
A --> C[关联 ID 贯穿]
A --> D[采样策略]
A --> E[敏感数据脱敏]
B --> B1[JSON 格式 + 统一字段]
C --> C1[trace_id 贯穿所有服务]
D --> D1[全量追踪 → 10% 采样]
E --> E1[PII 自动遮蔽]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
本章小结
| 主题 | 要点 |
|---|---|
| 三大支柱 | 日志 + 追踪 + 指标,缺一不可 |
| 追踪系统 | Span 嵌套结构,记录 Agent→LLM→Tool 链路 |
| 关键指标 | 延迟分位数、Token 用量、成功率/错误率 |
| 工具选型 | LangFuse(开源首选)、LangSmith(LangChain 生态) |
| 设计原则 | 结构化 + 关联 ID + 脱敏 + 采样 |
下一章:部署与企业实践