1 min read · 246 words

LLM 应用开发模式

从 API 调用到生产应用，有几种经典的架构模式。选择正确的模式是构建可靠 LLM 应用的第一步。

四种核心模式

graph TB
    A[LLM 应用模式] --> B[直接调用]
    A --> C[Chain 链式]
    A --> D[RAG 增强]
    A --> E[Agent 自主]
    B --> B1[简单问答<br/>翻译/摘要]
    C --> C1[多步处理<br/>提取→分析→生成]
    D --> D1[知识检索<br/>文档问答]
    E --> E1[工具使用<br/>复杂推理]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    style D fill:#e8f5e9,stroke:#388e3c,stroke-width:2px

模式对比

模式	复杂度	LLM 调用次数	延迟	适用场景	成本
直接调用	⭐	1	低	翻译、摘要、分类	最低
Chain 链式	⭐⭐	2-5	中	多步分析、内容生成	中
RAG	⭐⭐⭐	1-2	中	知识问答、文档搜索	中
Agent	⭐⭐⭐⭐	3-20+	高	复杂任务、工具使用	最高

生产级 LLM 客户端

"""
生产级 LLM 客户端，支持重试、超时、降级
"""
import time
import hashlib
from dataclasses import dataclass, field
from enum import Enum
class ModelTier(Enum):
    """Tier label attached to each configured model.

    The values are plain strings; they carry no numeric ordering of their own.
    """

    PRIMARY = "primary"    # first-choice model
    FALLBACK = "fallback"  # used when the primary model fails
    BUDGET = "budget"      # cheapest option
@dataclass
class ModelConfig:
    """Settings describing one model that the client can call.

    Only ``name`` and ``tier`` are read by the code visible in this file. The
    remaining fields are presumably consumed by concrete ``_api_call``
    implementations and cost accounting -- TODO confirm against those callers.
    """

    # Identifier reported back in LLMResponse.model and in failure logs.
    name: str
    # Tier used to decide the order in which models are tried.
    tier: ModelTier
    # Presumably the generation length cap passed to the provider.
    max_tokens: int = 4096
    # Presumably the sampling temperature passed to the provider.
    temperature: float = 0.7
    # Presumably a per-request timeout in seconds; not enforced in this file.
    timeout_seconds: float = 30.0
    # Presumably the price per 1,000 input tokens (currency not specified).
    cost_per_1k_input: float = 0.0
    # Presumably the price per 1,000 output tokens (currency not specified).
    cost_per_1k_output: float = 0.0
@dataclass
class LLMResponse:
    """Result of one generation request."""

    # Generated text.
    content: str
    # Name of the model that produced the text.
    model: str
    # Token counts reported by the API call; the client reports 0 for cache hits.
    input_tokens: int
    output_tokens: int
    # Wall-clock duration of the API call in milliseconds (0 for cache hits).
    latency_ms: float
    # True when the response was served from the client's cache.
    from_cache: bool = False
@dataclass
class RetryConfig:
    """Retry and exponential-backoff settings for API calls."""

    # Attempts made per model before giving up (counts the first call).
    max_retries: int = 3
    # Sleep before the first retry, in seconds; doubled on each further retry.
    base_delay: float = 1.0
    # Upper bound on any single backoff sleep, in seconds.
    max_delay: float = 30.0
    # Presumably the HTTP status codes that are worth retrying -- TODO confirm.
    retry_on: tuple[int, ...] = (429, 500, 502, 503, 529)
class ProductionLLMClient:
    """LLM client that wraps `_api_call` with caching, retries and fallback.

    Subclasses must implement `_api_call`; the base implementation raises
    `NotImplementedError`.

    Attributes:
        models: Configured models in the order they are tried: primary tier
            first, then fallback, then budget.
        retry_config: Retry/backoff settings applied to each model.
        cache: Mapping from cache key to ``{"content": ..., "model": ...}``.
    """

    # Order in which tiers are tried (lower rank first), keyed by the tier's
    # string value. Sorting on the raw values would be alphabetical
    # ("budget" < "fallback" < "primary") and would try the budget model first.
    _TIER_RANK = {"primary": 0, "fallback": 1, "budget": 2}

    def __init__(
        self,
        models: "list[ModelConfig]",
        retry_config: "RetryConfig | None" = None,
        cache: "dict | None" = None,
    ):
        """Create a client.

        Args:
            models: Models to use, in any order; they are re-ordered by tier.
            retry_config: Retry settings; a default `RetryConfig` when omitted.
            cache: Dict used as the response cache. The same object is used
                (not copied), so callers can share or pre-populate it. A new
                empty dict is created when omitted.
        """
        # sorted() is stable, so models sharing a tier keep the caller's order.
        self.models = sorted(models, key=self._tier_rank)
        self.retry_config = (
            retry_config if retry_config is not None else RetryConfig()
        )
        self.cache = cache if cache is not None else {}

    @classmethod
    def _tier_rank(cls, model: "ModelConfig") -> int:
        """Return the sort rank of *model*'s tier; unknown tiers sort last."""
        return cls._TIER_RANK.get(model.tier.value, len(cls._TIER_RANK))

    def generate(self, prompt: str, system: str = "", **kwargs) -> "LLMResponse":
        """Generate a response, using the cache, retries and model fallback.

        Args:
            prompt: User prompt.
            system: Optional system prompt.
            **kwargs: Extra parameters forwarded unchanged to `_api_call`.
                They are part of the cache key, so calls that differ only in
                these parameters do not share a cached answer.

        Returns:
            The response of the first model that succeeds. Cache hits are
            returned with zero token counts, zero latency and
            ``from_cache=True``.

        Raises:
            RuntimeError: If every configured model fails (or none is
                configured). The last underlying error is attached as the
                cause.
        """
        # 1. Cache lookup.
        cache_key = self._cache_key(prompt, system, kwargs)
        cached = self.cache.get(cache_key)
        if cached is not None:
            return LLMResponse(
                content=cached["content"],
                model=cached["model"],
                input_tokens=0,
                output_tokens=0,
                latency_ms=0,
                from_cache=True,
            )

        # 2. Try each model in priority order until one succeeds.
        last_error = None
        for model in self.models:
            try:
                response = self._call_with_retry(model, prompt, system, **kwargs)
            except Exception as e:
                last_error = e
                print(f"  模型 {model.name} 失败: {e}, 尝试降级...")
                continue
            # Only the text and the model name are cached; token counts and
            # latency describe a single call and are not replayed.
            self.cache[cache_key] = {
                "content": response.content,
                "model": response.model,
            }
            return response

        raise RuntimeError(f"所有模型均失败: {last_error}") from last_error

    def _call_with_retry(
        self, model: "ModelConfig", prompt: str, system: str, **kwargs
    ) -> "LLMResponse":
        """Call `_api_call` for one model, retrying with exponential backoff.

        At least one attempt is always made, even if ``max_retries`` is zero
        or negative. The delay before retry *k* (0-based) is
        ``base_delay * 2**k`` capped at ``max_delay``.

        Raises:
            Exception: Whatever `_api_call` raised on the final attempt, or
                immediately when the error is not retryable
                (see `_is_retryable`).
        """
        cfg = self.retry_config
        attempts = max(1, cfg.max_retries)
        for attempt in range(attempts):
            # perf_counter is monotonic, so latency cannot go negative when
            # the system clock is adjusted.
            start = time.perf_counter()
            try:
                result = self._api_call(model, prompt, system, **kwargs)
            except Exception as exc:
                is_last_attempt = attempt == attempts - 1
                if is_last_attempt or not self._is_retryable(exc):
                    raise
                time.sleep(min(cfg.base_delay * (2 ** attempt), cfg.max_delay))
                continue
            latency = (time.perf_counter() - start) * 1000
            return LLMResponse(
                content=result["content"],
                model=model.name,
                input_tokens=result.get("input_tokens", 0),
                output_tokens=result.get("output_tokens", 0),
                latency_ms=latency,
            )

    def _is_retryable(self, exc: Exception) -> bool:
        """Decide whether *exc* is worth another attempt on the same model.

        `NotImplementedError` is never retried. If the exception exposes a
        status code (``exc.status_code`` or ``exc.response.status_code``), it
        is retried only when that code is listed in ``retry_config.retry_on``.
        Errors without a status code are retried, because nothing is known
        about them.
        """
        if isinstance(exc, NotImplementedError):
            return False
        status = getattr(exc, "status_code", None)
        if status is None:
            status = getattr(getattr(exc, "response", None), "status_code", None)
        if status is None:
            return True
        return status in self.retry_config.retry_on

    def _api_call(
        self, model: "ModelConfig", prompt: str, system: str, **kwargs
    ) -> dict:
        """Perform the actual API request (must be implemented by subclasses).

        Returns:
            A dict with a ``"content"`` key and, optionally,
            ``"input_tokens"`` and ``"output_tokens"``.
        """
        raise NotImplementedError

    def _cache_key(
        self, prompt: str, system: str, params: "dict | None" = None
    ) -> str:
        """Return the cache key for a request.

        The key is the first 16 hex digits of a SHA-256 digest. When *params*
        is empty or omitted the key depends on *system* and *prompt* only;
        otherwise the parameters are folded in, sorted by name so that
        keyword order does not matter.
        """
        text = f"{system}||{prompt}"
        if params:
            text += f"||{sorted(params.items())!r}"
        return hashlib.sha256(text.encode()).hexdigest()[:16]

Chain 模式实现

"""
Chain 链式处理模式
"""
from dataclasses import dataclass, field
from typing import Callable, Any
@dataclass
class ChainStep:
    """One named stage of a chain."""

    # Label printed in the progress line when the step runs.
    name: str
    # Callable that receives the current context dict and returns the context
    # to hand to the next step.
    processor: Callable[[dict], dict]
    # Optional free-text description; not read by the code in this file.
    description: str = ""
class LLMChain:
    """Runs a sequence of steps, threading a context dict through them.

    Attributes:
        name: Name of the chain.
        steps: Steps in execution order.
    """

    def __init__(self, name: str):
        """Create an empty chain called *name*."""
        self.name = name
        self.steps: "list[ChainStep]" = []

    def add_step(self, step: "ChainStep") -> "LLMChain":
        """Append *step* and return the chain itself so calls can be chained."""
        self.steps.append(step)
        return self

    def run(self, initial_input: dict) -> dict:
        """Run every step in order and return the final context.

        Args:
            initial_input: Starting context. It is shallow-copied first, so
                keys added by steps do not appear in the caller's dict;
                nested mutable values are still shared.

        Returns:
            The context returned by the last step, or the copy of
            *initial_input* when the chain has no steps.

        Raises:
            TypeError: If a step's processor returns something other than a
                dict (typically a processor that forgot to ``return`` the
                context).
        """
        context = dict(initial_input)
        for position, step in enumerate(self.steps, start=1):
            print(f"  [{position}/{len(self.steps)}] {step.name}")
            result = step.processor(context)
            # Fail here, naming the step, instead of letting a later step
            # crash on a None/non-dict context far from the real cause.
            if not isinstance(result, dict):
                raise TypeError(
                    f"step {step.name!r} in chain {self.name!r} returned "
                    f"{type(result).__name__}, expected dict"
                )
            context = result
        return context
# Example: a content-production chain
def extract_topics(ctx: dict) -> dict:
    """Store a fixed placeholder topic list under ``"topics"`` and return *ctx*."""
    ctx.update(topics=["topic1", "topic2"])
    return ctx
def generate_outline(ctx: dict) -> dict:
    """Turn each entry of ``ctx["topics"]`` into a ``## `` heading.

    The headings are stored as a list under ``"outline"`` and *ctx* is
    returned. Raises ``KeyError`` when ``"topics"`` is missing.
    """
    headings = []
    for topic in ctx["topics"]:
        headings.append(f"## {topic}")
    ctx["outline"] = headings
    return ctx
def write_content(ctx: dict) -> dict:
    """Join the lines of ``ctx["outline"]`` with newlines into ``"content"``.

    Returns *ctx*. Raises ``KeyError`` when ``"outline"`` is missing.
    """
    outline_lines = ctx["outline"]
    body = "\n".join(outline_lines)
    ctx["content"] = body
    return ctx
# Assemble the example pipeline: extract topics -> build outline -> write text.
chain = LLMChain("content-pipeline")
chain.add_step(ChainStep("提取主题", extract_topics))
chain.add_step(ChainStep("生成大纲", generate_outline))
chain.add_step(ChainStep("撰写内容", write_content))

本章小结

主题	要点
四种模式	直接调用 / Chain / RAG / Agent
生产客户端	缓存 + 重试 + 模型降级
Chain 模式	步骤化处理，每步可独立测试
选择原则	从最简单的模式开始，按需升级

下一章：Prompt 管理与版本控制