3 min read · 547 words

性能与稳定性检测

为什么性能评估不能只看准确率

一个 LLM 应用即使在实验室里表现完美，在生产环境中也可能因为延迟、吞吐量或稳定性问题而不可用。性能评估的目的，就是确保模型在真实负载下依然可靠。

```mermaid
graph TB
    A[性能评估维度] --> B[延迟 Latency]
    A --> C[吞吐量 Throughput]
    A --> D[稳定性 Stability]
    A --> E[成本效率 Cost]
    B --> B1[首 Token 延迟 TTFT]
    B --> B2[Token 间延迟 ITL]
    B --> B3[端到端延迟 E2E]
    C --> C1[并发请求数]
    C --> C2[Token/秒吞吐]
    C --> C3[批处理效率]
    D --> D1[输出一致性]
    D --> D2[长对话衰退]
    D --> D3[错误恢复]
    E --> E1[每请求成本]
    E --> E2[缓存命中率]
    E --> E3[模型选择 ROI]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style C fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
    style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    style E fill:#ffcdd2,stroke:#c62828,stroke-width:2px
```

延迟基准测试

关键延迟指标

指标	定义	目标范围	测量方法
TTFT (Time to First Token)	从请求发送到首个 Token 返回	<500ms	流式响应计时
ITL (Inter-Token Latency)	连续 Token 之间的间隔	<50ms	流式 Token 时间差
E2E (End-to-End)	完整请求的总延迟	<5s (短文本)	请求-响应计时
P50/P95/P99	延迟百分位	P99 < 3×P50	统计分析

"""
LLM 延迟基准测试工具
"""
import time
import statistics
from dataclasses import dataclass, field
@dataclass
class LatencyResult:
    """Latency measurements captured for a single request.

    All durations are expressed in milliseconds.
    """

    ttft_ms: float  # time to first token
    itl_ms: list[float]  # gaps between consecutive tokens
    e2e_ms: float  # end-to-end duration of the whole request
    token_count: int  # number of tokens produced

    @property
    def avg_itl(self) -> float:
        """Return the mean inter-token latency, or 0 when none were recorded."""
        if not self.itl_ms:
            return 0
        return statistics.mean(self.itl_ms)

    @property
    def tokens_per_second(self) -> float:
        """Return tokens per second over the end-to-end duration.

        Yields 0 when the end-to-end duration is not strictly positive.
        """
        if not self.e2e_ms > 0:
            return 0
        elapsed_seconds = self.e2e_ms / 1000
        return self.token_count / elapsed_seconds
@dataclass
class BenchmarkReport:
    """Collect per-request latency results for one model and summarize them."""

    model_name: str
    # Each entry must expose ``ttft_ms``, ``e2e_ms`` and ``tokens_per_second``.
    results: list["LatencyResult"] = field(default_factory=list)

    @staticmethod
    def _percentile(ordered: list, pct: int) -> float:
        """Return the nearest-rank ``pct``-th percentile of an ascending list.

        ``ordered`` must be non-empty and already sorted in ascending order.
        """
        # Nearest-rank picks the value at ordinal rank ceil(pct / 100 * n).
        # Integer ceiling division avoids float rounding, and avoids the
        # off-by-one of ``int(n * pct)`` which returns the maximum whenever
        # n * pct is an exact integer (e.g. P95 of 20 samples).
        rank = -(-len(ordered) * pct // 100)
        return ordered[max(rank, 1) - 1]

    def summary(self) -> dict:
        """Build a statistical summary of the collected results.

        Returns:
            A dict with keys ``model``, ``requests``, ``ttft`` (p50/p95/p99),
            ``e2e`` (p50/p95/mean) and ``throughput`` (avg_tps/max_tps).
            When no results have been recorded every statistic is 0.0
            instead of raising.
        """
        if not self.results:
            return {
                "model": self.model_name,
                "requests": 0,
                "ttft": {"p50": 0.0, "p95": 0.0, "p99": 0.0},
                "e2e": {"p50": 0.0, "p95": 0.0, "mean": 0.0},
                "throughput": {"avg_tps": 0.0, "max_tps": 0.0},
            }

        # Sort each series once and reuse it for every percentile.
        ttfts = sorted(r.ttft_ms for r in self.results)
        e2es = sorted(r.e2e_ms for r in self.results)
        tps = [r.tokens_per_second for r in self.results]

        return {
            "model": self.model_name,
            "requests": len(self.results),
            "ttft": {
                "p50": statistics.median(ttfts),
                "p95": self._percentile(ttfts, 95),
                "p99": self._percentile(ttfts, 99),
            },
            "e2e": {
                "p50": statistics.median(e2es),
                "p95": self._percentile(e2es, 95),
                "mean": statistics.mean(e2es),
            },
            "throughput": {
                "avg_tps": statistics.mean(tps),
                "max_tps": max(tps),
            },
        }

    def print_report(self):
        """Print the summary as a short human-readable report on stdout."""
        s = self.summary()
        print(f"\n{'='*50}")
        print(f"模型: {s['model']}  |  请求数: {s['requests']}")
        print(f"{'='*50}")
        print(f"TTFT  P50: {s['ttft']['p50']:.0f}ms  "
              f"P95: {s['ttft']['p95']:.0f}ms  "
              f"P99: {s['ttft']['p99']:.0f}ms")
        print(f"E2E   P50: {s['e2e']['p50']:.0f}ms  "
              f"P95: {s['e2e']['p95']:.0f}ms  "
              f"Mean: {s['e2e']['mean']:.0f}ms")
        print(f"吞吐  Avg: {s['throughput']['avg_tps']:.1f} tok/s  "
              f"Max: {s['throughput']['max_tps']:.1f} tok/s")

并发压力测试

```mermaid
graph LR
    subgraph 压力测试流程
        A[定义负载模式] --> B[逐步增压]
        B --> C[记录指标]
        C --> D[识别瓶颈]
        D --> E[报告与决策]
    end
    subgraph 负载模式
        F[恒定负载] --> F1[固定 QPS]
        G[阶梯负载] --> G1[逐步递增]
        H[脉冲负载] --> H1[突发流量]
        I[混合负载] --> I1[长短请求混合]
    end
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    style E fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
```

"""
并发压力测试框架
"""
import asyncio
import statistics
from dataclasses import dataclass
@dataclass
class LoadTestConfig:
    """Settings for a load test run."""

    concurrent_users: int = 10
    requests_per_user: int = 5
    ramp_up_seconds: int = 10
    prompt_template: str = "请用100字介绍{topic}"
    # ``None`` is a sentinel meaning "use the built-in topic list".
    topics: list[str] = None

    def __post_init__(self):
        """Fill in the built-in topics when the caller supplied none."""
        if self.topics is not None:
            return
        # Build a new list for every instance so configs never share state.
        self.topics = list(("Python", "机器学习", "数据库", "微服务", "DevOps"))
class ConcurrencyBenchmark:
    """Fire a configurable batch of concurrent requests at an async model.

    ``model_fn`` is called with a prompt string and must return an awaitable.
    ``config`` supplies ``concurrent_users``, ``requests_per_user``,
    ``ramp_up_seconds``, ``prompt_template`` and ``topics``.
    """

    def __init__(self, model_fn, config: "LoadTestConfig"):
        self.model_fn = model_fn
        self.config = config
        self.latencies = []  # latency in ms of every successful request
        self.errors = 0  # number of requests that raised

    async def _single_request(self, prompt: str) -> float:
        """Issue one request and record its outcome.

        Returns:
            The elapsed time in milliseconds, or -1 if the request raised.
        """
        loop = asyncio.get_running_loop()
        start = loop.time()
        try:
            await self.model_fn(prompt)
        except Exception:
            # Deliberately broad: any failure of the model call counts as
            # one failed request rather than aborting the whole benchmark.
            self.errors += 1
            return -1
        elapsed = (loop.time() - start) * 1000
        self.latencies.append(elapsed)
        return elapsed

    async def run(self) -> dict:
        """Execute the load test and return aggregate statistics.

        Users are started one after another, spaced evenly across
        ``ramp_up_seconds``; all requests of one user are launched together
        when that user starts.

        Returns:
            A dict with ``total_requests``, ``successful``, ``failed``,
            ``error_rate`` (percent), ``avg_latency_ms`` and
            ``p95_latency_ms``. Latency figures are 0 when no request
            succeeded, and ``error_rate`` is 0 when no request was issued.
        """
        cfg = self.config
        # Start from a clean slate so repeated runs do not mix their results.
        self.latencies = []
        self.errors = 0

        tasks = []
        for user in range(cfg.concurrent_users):
            for j in range(cfg.requests_per_user):
                topic = cfg.topics[j % len(cfg.topics)]
                prompt = cfg.prompt_template.format(topic=topic)
                # Schedule right away: a bare coroutine would not start until
                # gather(), which would turn the ramp-up into one big burst.
                tasks.append(asyncio.create_task(self._single_request(prompt)))
            # Staggered start: pause before bringing the next user online.
            if user < cfg.concurrent_users - 1:
                await asyncio.sleep(cfg.ramp_up_seconds / cfg.concurrent_users)
        await asyncio.gather(*tasks)

        total = cfg.concurrent_users * cfg.requests_per_user
        if self.latencies:
            ordered = sorted(self.latencies)
            avg_latency = statistics.mean(ordered)
            # Nearest-rank P95: value at ordinal rank ceil(0.95 * n).
            rank = -(-len(ordered) * 95 // 100)
            p95_latency = ordered[max(rank, 1) - 1]
        else:
            avg_latency = 0
            p95_latency = 0

        return {
            "total_requests": total,
            "successful": len(self.latencies),
            "failed": self.errors,
            "error_rate": self.errors / total * 100 if total else 0,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
        }

输出稳定性评估

稳定性维度	评估方法	通过标准
一致性	同一 Prompt 运行 10 次	语义相似度 >0.85
长对话衰退	20 轮后重测核心能力	准确率下降 <10%
上下文容量	逐步增加输入长度	在模型声明的上下文窗口内保持准确
格式稳定	JSON/表格输出 100 次	格式正确率 >95%
错误恢复	注入异常输入后正常提问	恢复正常率 100%

模型对比基准

指标	GPT-4o	Claude 3.5 Sonnet	DeepSeek V3	Gemini 2.0
TTFT (P50)	~300ms	~250ms	~200ms	~350ms
吞吐 (tok/s)	~80	~90	~120	~75
128K 上下文稳定性	★★★★	★★★★★	★★★★	★★★★
JSON 格式一致率	96%	98%	94%	95%
并发 50 错误率	<1%	<1%	<2%	<1%
每百万 Token 成本	$2.50	$3.00	$0.27	$1.25

注：以上为 2026 年初的参考数据，实际性能因地区和部署方式而异。

本章小结

性能评估需关注延迟（TTFT/ITL/E2E）、吞吐量、稳定性和成本四个维度
基准测试需模拟真实生产负载，包括并发、长对话和异常场景
使用百分位指标（P50/P95/P99）而非均值来衡量延迟
输出稳定性评估确保模型在重复调用中保持一致
定期重新跑基准，因为模型服务商会持续优化性能

下一章：进入生产评估环节，学习 A/B 测试与持续监控策略。