缓存系统设计
缓存是 LLM 生产系统中性价比最高的优化手段——同时降低延迟和成本。
缓存策略全景
graph TB
A[LLM 缓存] --> B[精确匹配缓存]
A --> C[语义缓存]
A --> D[前缀缓存]
B --> B1[哈希匹配]
B --> B2[适合:固定问答]
C --> C1[向量相似度匹配]
C --> C2[适合:自然语言变体]
D --> D1[KV Cache 复用]
D --> D2[适合:相同 System Prompt]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style C fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
多层缓存架构
"""
多层缓存系统
"""
import hashlib
import time
from typing import Optional
from collections import OrderedDict
class LRUCache:
    """In-memory LRU cache with per-entry TTL (the L1 layer).

    Entries are stored in an OrderedDict whose insertion order doubles
    as recency order: accessed keys are moved to the end, and overflow
    evicts from the front (least recently used).
    """

    def __init__(self, capacity: int = 1000):
        # Maximum number of entries before LRU eviction kicks in.
        self.capacity = capacity
        # key -> {"value": str, "expires": float (epoch seconds)}
        self.cache: OrderedDict = OrderedDict()

    def get(self, key: str) -> Optional[str]:
        """Return the cached value, or None when absent or expired.

        Bug fix: the original returned entries past their TTL because
        expiry was only honored by cleanup(). Expired entries are now
        evicted eagerly on access.
        """
        entry = self.cache.get(key)
        if entry is None:
            return None
        if entry["expires"] < time.time():
            # Expired: drop it so it no longer occupies capacity.
            del self.cache[key]
            return None
        self.cache.move_to_end(key)  # mark as most recently used
        return entry["value"]

    def set(self, key: str, value: str, ttl: float = 3600) -> None:
        """Insert or refresh a value with a TTL in seconds.

        Evicts the least recently used entry when at capacity.
        """
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)  # drop LRU entry
        self.cache[key] = {
            "value": value,
            "expires": time.time() + ttl,
        }

    def cleanup(self) -> int:
        """Remove all expired entries and return how many were removed."""
        now = time.time()
        expired = [
            k for k, v in self.cache.items()
            if v["expires"] < now
        ]
        for k in expired:
            del self.cache[k]
        return len(expired)
class RedisCache:
    """Simulated Redis backend (the L2 layer).

    A plain dict stands in for Redis; each entry carries an absolute
    expiry timestamp that is checked on read.
    """

    def __init__(self):
        # key -> {"value": str, "expires": float (epoch seconds)}
        self.store: dict = {}

    def get(self, key: str) -> Optional[str]:
        """Return the value for ``key``, or None if missing or expired."""
        entry = self.store.get(key)
        if entry is None:
            return None
        if not entry["expires"] > time.time():
            return None
        return entry["value"]

    def set(self, key: str, value: str, ttl: float = 7200) -> None:
        """Store ``value`` under ``key`` with a TTL in seconds."""
        self.store[key] = {"value": value, "expires": time.time() + ttl}
class MultiLayerCache:
    """Two-tier response cache: in-memory L1 in front of a larger L2.

    get() probes L1 first, then L2; an L2 hit back-fills L1.
    Hit/miss counters are accumulated in ``self.stats``.
    """

    def __init__(self):
        self.l1 = LRUCache(capacity=500)  # in-memory: fast but small
        self.l2 = RedisCache()            # Redis (simulated): slower, larger
        self.stats = {"l1_hit": 0, "l2_hit": 0, "miss": 0}

    def _make_key(self, query: str, model: str) -> str:
        """Build a stable cache key from (model, normalized query)."""
        normalized = query.strip().lower()
        raw = f"{model}:{normalized}"
        return hashlib.sha256(raw.encode()).hexdigest()

    def get(self, query: str, model: str = "default") -> Optional[str]:
        """Return the cached response for ``query``, or None on a miss.

        Bug fix: hits are detected with ``is not None`` instead of
        truthiness, so a legitimately empty cached response ("") no
        longer registers as a miss.
        """
        key = self._make_key(query, model)
        # L1: fast path.
        result = self.l1.get(key)
        if result is not None:
            self.stats["l1_hit"] += 1
            return result
        # L2: slower path; back-fill L1 on a hit.
        result = self.l2.get(key)
        if result is not None:
            self.stats["l2_hit"] += 1
            # Use the same 30-minute L1 TTL as set() for consistency
            # (the original back-fill silently used LRUCache's
            # 3600-second default).
            self.l1.set(key, result, ttl=1800)
            return result
        self.stats["miss"] += 1
        return None

    def set(
        self, query: str, response: str, model: str = "default"
    ) -> None:
        """Store a response in both layers (L1: 30 min, L2: 2 h)."""
        key = self._make_key(query, model)
        self.l1.set(key, response, ttl=1800)  # L1: 30 minutes
        self.l2.set(key, response, ttl=7200)  # L2: 2 hours

    def get_stats(self) -> dict:
        """Return hit/miss counters plus an overall hit-rate string."""
        total = sum(self.stats.values())
        hits = self.stats["l1_hit"] + self.stats["l2_hit"]
        return {
            **self.stats,
            "total": total,
            "hit_rate": f"{hits / max(total, 1):.1%}",
        }
# --- Demo ---
cache = MultiLayerCache()

# First request: nothing is cached yet, so this is a miss.
first = cache.get("什么是Python?")
print(f"首次查询: {first}")  # None

# Populate both layers with the response.
cache.set("什么是Python?", "Python是一种编程语言...")

# Second request for the same query: served from L1.
second = cache.get("什么是Python?")
print(f"第二次: {second[:20]}...")  # from L1
print(f"统计: {cache.get_stats()}")
缓存一致性
"""
缓存失效策略
"""
class CacheInvalidator:
    """Manages invalidation for a :class:`MultiLayerCache`."""

    def __init__(self, cache: MultiLayerCache):
        self.cache = cache
        # Declarative rules recorded by add_rule():
        # {"trigger": str, "pattern": str, "action": str}
        self.invalidation_rules: list[dict] = []

    def add_rule(
        self,
        trigger: str,
        pattern: str,
        action: str = "delete",
    ) -> None:
        """Register an invalidation rule (recorded only, not evaluated here)."""
        self.invalidation_rules.append({
            "trigger": trigger,
            "pattern": pattern,
            "action": action,
        })

    def on_model_update(self, old_model: str, new_model: str) -> int:
        """Purge the L1 cache when the serving model changes.

        Returns the number of entries removed. Cache keys are opaque
        hashes, so per-model filtering is impossible at this layer; a
        production system would delete by a model-scoped key prefix
        instead of clearing everything.

        Fix: the original iterated ``items()`` while ignoring the
        values, copied every key into a list, then re-checked membership
        before each delete — clearing the dict directly is equivalent
        and simpler.
        """
        l1_store = self.cache.l1.cache
        count = len(l1_store)
        l1_store.clear()
        print(f"模型更新 {old_model} → {new_model},清理 {count} 条缓存")
        return count

    def on_knowledge_update(self, doc_ids: list[str]) -> int:
        """Invalidate caches affected by updated knowledge-base docs.

        Placeholder: a real implementation needs a doc_id -> cache_key
        mapping; this stub only reports and returns 0.
        """
        count = 0
        print(f"知识库更新 {len(doc_ids)} 篇文档,需清理相关缓存")
        return count
# Comparison of invalidation strategies, printed below for reference.
INVALIDATION_STRATEGIES = {
    "TTL 过期": {
        "说明": "设置过期时间,到期自动删除",
        "优点": "实现简单",
        "缺点": "可能返回过期数据",
        "适用": "数据变化不频繁",
    },
    "主动失效": {
        "说明": "数据源变更时主动清除缓存",
        "优点": "数据一致性好",
        "缺点": "需要维护依赖关系",
        "适用": "知识库更新频繁",
    },
    "版本号": {
        "说明": "缓存键包含版本号,更新后版本号变化",
        "优点": "简单有效",
        "缺点": "旧版本占空间",
        "适用": "模型或 Prompt 更新",
    },
}

# Print one section per strategy, one attribute per line.
for strategy, details in INVALIDATION_STRATEGIES.items():
    print(f"\n{strategy}:")
    for field, value in details.items():
        print(f" {field}: {value}")
缓存效果评估
| 场景 | 缓存命中率 | 延迟降低 | 成本降低 |
|---|---|---|---|
| FAQ 客服 | 60-80% | 70% | 70% |
| 代码助手 | 10-30% | 15% | 15% |
| 创意写作 | 5-10% | 5% | 5% |
| 数据分析 | 30-50% | 35% | 35% |
| 文档问答 | 40-60% | 45% | 45% |
经验法则:重复性越高的场景,缓存收益越大。
本章小结
- L1 (内存) + L2 (Redis) 多层缓存架构
- 精确缓存适合固定问答,语义缓存适合自然变体
- 缓存失效策略选择:TTL / 主动失效 / 版本号
- 模型更新和知识库更新时必须清理缓存
- FAQ 类场景缓存收益最高
下一章:监控与可观测性。