缓存系统设计
缓存是 LLM 生产系统中性价比最高的优化手段——同时降低延迟和成本。
缓存策略全景
graph TB
A[LLM 缓存] --> B[精确匹配缓存]
A --> C[语义缓存]
A --> D[前缀缓存]
B --> B1[哈希匹配]
B --> B2[适合:固定问答]
C --> C1[向量相似度匹配]
C --> C2[适合:自然语言变体]
D --> D1[KV Cache 复用]
D --> D2[适合:相同 System Prompt]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style C fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
多层缓存架构
"""
多层缓存系统
"""
import hashlib
import time
from typing import Optional
from collections import OrderedDict
class LRUCache:
    """In-memory LRU cache with per-entry TTL (the L1 layer).

    Entries are stored in an OrderedDict whose insertion order doubles
    as recency order: accessed keys are moved to the end, and overflow
    evicts from the front (least recently used).
    """

    def __init__(self, capacity: int = 1000):
        # Maximum number of entries before LRU eviction kicks in.
        self.capacity = capacity
        # key -> {"value": str, "expires": float (epoch seconds)}
        self.cache: OrderedDict = OrderedDict()

    def get(self, key: str) -> Optional[str]:
        """Return the cached value, or None when absent or expired.

        Bug fix: the original returned entries past their TTL because
        expiry was only honored by cleanup(). Expired entries are now
        evicted eagerly on access.
        """
        entry = self.cache.get(key)
        if entry is None:
            return None
        if entry["expires"] < time.time():
            # Expired: drop it so it no longer occupies capacity.
            del self.cache[key]
            return None
        self.cache.move_to_end(key)  # mark as most recently used
        return entry["value"]

    def set(self, key: str, value: str, ttl: float = 3600) -> None:
        """Insert or refresh a value with a TTL in seconds.

        Evicts the least recently used entry when at capacity.
        """
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)  # drop LRU entry
        self.cache[key] = {
            "value": value,
            "expires": time.time() + ttl,
        }

    def cleanup(self) -> int:
        """Remove all expired entries and return how many were removed."""
        now = time.time()
        expired = [
            k for k, v in self.cache.items()
            if v["expires"] < now
        ]
        for k in expired:
            del self.cache[k]
        return len(expired)
class RedisCache:
    """Simulated Redis backend (the L2 layer).

    A plain dict stands in for Redis; each entry carries an absolute
    expiry timestamp that is checked on read.
    """

    def __init__(self):
        # key -> {"value": str, "expires": float (epoch seconds)}
        self.store: dict = {}

    def get(self, key: str) -> Optional[str]:
        """Return the value for ``key``, or None if missing or expired."""
        entry = self.store.get(key)
        if entry is None:
            return None
        if not entry["expires"] > time.time():
            return None
        return entry["value"]

    def set(self, key: str, value: str, ttl: float = 7200) -> None:
        """Store ``value`` under ``key`` with a TTL in seconds."""
        self.store[key] = {"value": value, "expires": time.time() + ttl}
class MultiLayerCache:
    """Two-tier response cache: in-memory L1 in front of a larger L2.

    get() probes L1 first, then L2; an L2 hit back-fills L1.
    Hit/miss counters are accumulated in ``self.stats``.
    """

    def __init__(self):
        self.l1 = LRUCache(capacity=500)  # in-memory: fast but small
        self.l2 = RedisCache()            # Redis (simulated): slower, larger
        self.stats = {"l1_hit": 0, "l2_hit": 0, "miss": 0}

    def _make_key(self, query: str, model: str) -> str:
        """Build a stable cache key from (model, normalized query)."""
        normalized = query.strip().lower()
        raw = f"{model}:{normalized}"
        return hashlib.sha256(raw.encode()).hexdigest()

    def get(self, query: str, model: str = "default") -> Optional[str]:
        """Return the cached response for ``query``, or None on a miss.

        Bug fix: hits are detected with ``is not None`` instead of
        truthiness, so a legitimately empty cached response ("") no
        longer registers as a miss.
        """
        key = self._make_key(query, model)
        # L1: fast path.
        result = self.l1.get(key)
        if result is not None:
            self.stats["l1_hit"] += 1
            return result
        # L2: slower path; back-fill L1 on a hit.
        result = self.l2.get(key)
        if result is not None:
            self.stats["l2_hit"] += 1
            # Use the same 30-minute L1 TTL as set() for consistency
            # (the original back-fill silently used LRUCache's
            # 3600-second default).
            self.l1.set(key, result, ttl=1800)
            return result
        self.stats["miss"] += 1
        return None

    def set(
        self, query: str, response: str, model: str = "default"
    ) -> None:
        """Store a response in both layers (L1: 30 min, L2: 2 h)."""
        key = self._make_key(query, model)
        self.l1.set(key, response, ttl=1800)  # L1: 30 minutes
        self.l2.set(key, response, ttl=7200)  # L2: 2 hours

    def get_stats(self) -> dict:
        """Return hit/miss counters plus an overall hit-rate string."""
        total = sum(self.stats.values())
        hits = self.stats["l1_hit"] + self.stats["l2_hit"]
        return {
            **self.stats,
            "total": total,
            "hit_rate": f"{hits / max(total, 1):.1%}",
        }
# --- Demo ---
cache = MultiLayerCache()

# First request: nothing is cached yet, so this is a miss.
first = cache.get("什么是Python?")
print(f"首次查询: {first}")  # None

# Populate both layers with the response.
cache.set("什么是Python?", "Python是一种编程语言...")

# Second request for the same query: served from L1.
second = cache.get("什么是Python?")
print(f"第二次: {second[:20]}...")  # from L1
print(f"统计: {cache.get_stats()}")
缓存一致性
"""
缓存失效策略
"""
class CacheInvalidator:
    """Manages invalidation for a :class:`MultiLayerCache`."""

    def __init__(self, cache: MultiLayerCache):
        self.cache = cache
        # Declarative rules recorded by add_rule():
        # {"trigger": str, "pattern": str, "action": str}
        self.invalidation_rules: list[dict] = []

    def add_rule(
        self,
        trigger: str,
        pattern: str,
        action: str = "delete",
    ) -> None:
        """Register an invalidation rule (recorded only, not evaluated here)."""
        self.invalidation_rules.append({
            "trigger": trigger,
            "pattern": pattern,
            "action": action,
        })

    def on_model_update(self, old_model: str, new_model: str) -> int:
        """Purge the L1 cache when the serving model changes.

        Returns the number of entries removed. Cache keys are opaque
        hashes, so per-model filtering is impossible at this layer; a
        production system would delete by a model-scoped key prefix
        instead of clearing everything.

        Fix: the original iterated ``items()`` while ignoring the
        values, copied every key into a list, then re-checked membership
        before each delete — clearing the dict directly is equivalent
        and simpler.
        """
        l1_store = self.cache.l1.cache
        count = len(l1_store)
        l1_store.clear()
        print(f"模型更新 {old_model} → {new_model},清理 {count} 条缓存")
        return count

    def on_knowledge_update(self, doc_ids: list[str]) -> int:
        """Invalidate caches affected by updated knowledge-base docs.

        Placeholder: a real implementation needs a doc_id -> cache_key
        mapping; this stub only reports and returns 0.
        """
        count = 0
        print(f"知识库更新 {len(doc_ids)} 篇文档,需清理相关缓存")
        return count
# Comparison of invalidation strategies, printed below for reference.
INVALIDATION_STRATEGIES = {
    "TTL 过期": {
        "说明": "设置过期时间,到期自动删除",
        "优点": "实现简单",
        "缺点": "可能返回过期数据",
        "适用": "数据变化不频繁",
    },
    "主动失效": {
        "说明": "数据源变更时主动清除缓存",
        "优点": "数据一致性好",
        "缺点": "需要维护依赖关系",
        "适用": "知识库更新频繁",
    },
    "版本号": {
        "说明": "缓存键包含版本号,更新后版本号变化",
        "优点": "简单有效",
        "缺点": "旧版本占空间",
        "适用": "模型或 Prompt 更新",
    },
}

# Print one section per strategy, one attribute per line.
for strategy, details in INVALIDATION_STRATEGIES.items():
    print(f"\n{strategy}:")
    for field, value in details.items():
        print(f" {field}: {value}")
缓存效果评估
| 场景 | 缓存命中率 | 延迟降低 | 成本降低 |
|---|---|---|---|
| FAQ 客服 | 60-80% | 70% | 70% |
| 代码助手 | 10-30% | 15% | 15% |
| 创意写作 | 5-10% | 5% | 5% |
| 数据分析 | 30-50% | 35% | 35% |
| 文档问答 | 40-60% | 45% | 45% |
经验法则:重复性越高的场景,缓存收益越大。
本章小结
- L1 (内存) + L2 (Redis) 多层缓存架构
- 精确缓存适合固定问答,语义缓存适合自然变体
- 缓存失效策略选择:TTL / 主动失效 / 版本号
- 模型更新和知识库更新时必须清理缓存
- FAQ 类场景缓存收益最高
下一章:监控与可观测性。