多模态 API 架构设计
当多模态功能需要对外暴露为服务时,API 的设计质量直接影响调用方的使用体验和系统的可维护性。本章聚焦多模态 API 的架构模式、接口规范和性能优化。
多模态 API 架构模式
graph TB
A[多模态 API 架构] --> B[单一端点模式
Unified Endpoint] A --> C[分模态端点模式
Per-Modality] A --> D[流式响应模式
Streaming] A --> E[异步任务模式
Async Job] B --> B1[优: 简单
缺: 难以优化] C --> C1[优: 独立扩缩容
缺: 调用分散] D --> D1[优: 低感知延迟
缺: 实现复杂] E --> E1[优: 适合长任务
缺: 轮询开销] style A fill:#ede7f6,stroke:#5e35b1,stroke-width:2px style D fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style E fill:#c8e6c9,stroke:#43a047,stroke-width:2px
Unified Endpoint] A --> C[分模态端点模式
Per-Modality] A --> D[流式响应模式
Streaming] A --> E[异步任务模式
Async Job] B --> B1[优: 简单
缺: 难以优化] C --> C1[优: 独立扩缩容
缺: 调用分散] D --> D1[优: 低感知延迟
缺: 实现复杂] E --> E1[优: 适合长任务
缺: 轮询开销] style A fill:#ede7f6,stroke:#5e35b1,stroke-width:2px style D fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style E fill:#c8e6c9,stroke:#43a047,stroke-width:2px
统一多模态 API 实现
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Iterator
import time
import uuid
import json
class ModalityType(Enum):
    """Supported content modalities for a multimodal message part."""

    TEXT = "text"            # inline plain-text content
    IMAGE_URL = "image_url"  # image referenced by URL
    AUDIO_URL = "audio_url"  # audio clip referenced by URL
    FILE_URL = "file_url"    # generic file referenced by URL
@dataclass
class ContentPart:
    """One piece of multimodal content (aligned with the OpenAI message format)."""

    type: ModalityType
    content: str          # text body, or a URL for non-text modalities
    detail: str = "auto"  # image resolution hint: auto / low / high

    def to_api_format(self) -> dict:
        """Serialize this part into the wire-format dict expected by the API."""
        if self.type is ModalityType.TEXT:
            return {"type": "text", "text": self.content}
        if self.type is ModalityType.IMAGE_URL:
            return {
                "type": "image_url",
                "image_url": {"url": self.content, "detail": self.detail},
            }
        # Audio and file parts share a simple {type, url} shape.
        return {"type": self.type.value, "url": self.content}
@dataclass
class MultimodalMessage:
    """A chat message whose content may mix several modalities."""

    role: str  # "user" / "assistant" / "system"
    parts: list[ContentPart] = field(default_factory=list)

    def add_text(self, text: str) -> "MultimodalMessage":
        """Append a text part; returns self for fluent chaining."""
        self.parts.append(ContentPart(ModalityType.TEXT, text))
        return self

    def add_image(self, url: str, detail: str = "auto") -> "MultimodalMessage":
        """Append an image-URL part; returns self for fluent chaining."""
        self.parts.append(ContentPart(ModalityType.IMAGE_URL, url, detail))
        return self

    def add_audio(self, url: str) -> "MultimodalMessage":
        """Append an audio-URL part; returns self for fluent chaining."""
        self.parts.append(ContentPart(ModalityType.AUDIO_URL, url))
        return self

    def to_api_format(self) -> dict:
        """Serialize to the API wire format.

        A message holding exactly one text part collapses to the short
        ``{"role", "content": str}`` form; anything else uses the list form.
        """
        is_plain_text = (
            len(self.parts) == 1 and self.parts[0].type == ModalityType.TEXT
        )
        if is_plain_text:
            return {"role": self.role, "content": self.parts[0].content}
        return {
            "role": self.role,
            "content": [part.to_api_format() for part in self.parts],
        }
@dataclass
class APIResponse:
    """Normalized response envelope returned by the client."""

    request_id: str
    content: str
    input_tokens: int
    output_tokens: int
    latency_ms: int
    model: str
    finish_reason: str = "stop"

    def to_dict(self) -> dict:
        """Render the response as a JSON-serializable dict."""
        usage = {
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "total_tokens": self.input_tokens + self.output_tokens,
        }
        return {
            "id": self.request_id,
            "model": self.model,
            "content": self.content,
            "usage": usage,
            "latency_ms": self.latency_ms,
            "finish_reason": self.finish_reason,
        }
class MultimodalAPIClient:
    """Multimodal API client with per-call usage and cost tracking.

    Prices are USD per 1M tokens; models not listed fall back to
    conservative defaults in :meth:`estimate_cost`. The actual provider
    call inside :meth:`complete` is stubbed so the flow runs offline.
    """

    INPUT_COST_PER_1M_TOKENS = {
        "gpt-4o": 5.00,
        "gpt-4o-mini": 0.15,
        "claude-3-5-sonnet": 3.00,
        "gemini-1.5-pro": 3.50,
    }
    OUTPUT_COST_PER_1M_TOKENS = {
        "gpt-4o": 15.00,
        "gpt-4o-mini": 0.60,
        "claude-3-5-sonnet": 15.00,
        "gemini-1.5-pro": 10.50,
    }

    def __init__(self, default_model: str = "gpt-4o-mini"):
        self.default_model = default_model
        self._total_cost = 0.0   # accumulated estimated USD across all calls
        self._request_count = 0  # number of complete() calls issued

    def estimate_cost(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int
    ) -> float:
        """Estimate the USD cost of one call.

        Unknown models fall back to $5 / $15 per 1M input / output tokens.
        """
        in_cost = self.INPUT_COST_PER_1M_TOKENS.get(model, 5.0) * input_tokens / 1_000_000
        out_cost = self.OUTPUT_COST_PER_1M_TOKENS.get(model, 15.0) * output_tokens / 1_000_000
        return round(in_cost + out_cost, 6)

    def complete(
        self,
        messages: list[MultimodalMessage],
        model: str | None = None,
        max_tokens: int = 1024,
        temperature: float = 0.7,
        stream: bool = False,
    ) -> APIResponse:
        """Execute a multimodal completion request.

        NOTE(review): `stream` is currently accepted but unused by the
        stubbed implementation — confirm intent before relying on it.
        """
        model = model or self.default_model
        start_ms = int(time.time() * 1000)
        # Convert to the provider wire format.
        api_messages = [m.to_api_format() for m in messages]
        # Actual call (illustrative):
        # from openai import OpenAI
        # client = OpenAI()
        # resp = client.chat.completions.create(
        #     model=model,
        #     messages=api_messages,
        #     max_tokens=max_tokens,
        #     temperature=temperature,
        # )
        elapsed = int(time.time() * 1000) - start_ms
        self._request_count += 1
        # Simulated response
        response = APIResponse(
            request_id=str(uuid.uuid4())[:8],
            content="[模拟: 多模态 API 响应]",
            input_tokens=len(str(api_messages)) // 4,
            output_tokens=50,
            latency_ms=elapsed,
            model=model,
        )
        # BUGFIX: accumulate the per-call cost so usage_stats reports a
        # real total — previously _total_cost was never updated and the
        # reported cost was always 0.0.
        self._total_cost += self.estimate_cost(
            model, response.input_tokens, response.output_tokens
        )
        return response

    def complete_with_image(
        self,
        text_query: str,
        image_url: str,
        model: str | None = None,
    ) -> APIResponse:
        """Convenience method: mixed image + text query."""
        message = (
            MultimodalMessage(role="user")
            .add_image(image_url, detail="auto")
            .add_text(text_query)
        )
        return self.complete([message], model=model)

    @property
    def usage_stats(self) -> dict:
        """Aggregate request count and estimated spend for this client."""
        return {
            "total_requests": self._request_count,
            "total_cost_usd": round(self._total_cost, 4),
        }
# Usage example
client = MultimodalAPIClient(default_model="gpt-4o-mini")

# Build a mixed text + image analysis request.
msg = MultimodalMessage(role="user")
msg.add_text("请描述这张图片中的主要内容,以及图中的商品名称和价格标签。")
msg.add_image("https://example.com/product_shelf.jpg", detail="high")

response = client.complete([msg], max_tokens=256)
print(f"请求 ID: {response.request_id}")
print(f"Token 使用: {response.input_tokens} in / {response.output_tokens} out")

cost = client.estimate_cost("gpt-4o-mini", response.input_tokens, response.output_tokens)
print(f"预计成本: ${cost:.6f}")
API 设计最佳实践
| 设计决策 | 推荐方案 | 原因 |
|---|---|---|
| 请求 ID | 每次请求生成 UUID | 方便日志追踪 |
| 图片传递 | URL 优于 Base64 | Base64 体积约增大 33%,增加网络开销 |
| 流式输出 | 长文本/代码生成场景必用 | 用户感知延迟降低 60%+ |
| 错误格式 | 统一 {error: {code, message}} | 调用方错误处理更简单 |
| 版本管理 | URL 路径版本 /v1/ /v2/ | 不破坏现有调用方 |
| 速率限制 | 按 token 限制,不按请求次数 | 更公平,防止大请求滥用 |
本章小结
- ContentPart 模式统一多模态输入——文本、图像、音频结构相同
- URL 传图比 Base64 更高效——尤其在批量场景下
- 成本追踪在 Client 层实现——每次调用都记录 token 和费用
- 流式响应对用户体验至关重要——长输出场景必须支持
- 统一错误格式——调用方无需为不同模型的错误写不同处理逻辑
下一章:部署优化与最佳实践