图像生成工程实战
理解图像生成模型的原理是一回事,在生产环境稳定运行是另一回事。本章聚焦工程侧:如何封装 API、管理提示词、控制质量,以及构建批量生成 Pipeline。
图像生成技术栈
graph TB
    A[图像生成需求] --> B{规模?}
    B -->|小规模/原型| C[API 方案<br/>DALL-E 3 / Stability AI]
    B -->|中大规模| D[自托管方案<br/>Stable Diffusion]
    B -->|企业级| E[混合方案<br/>API + 本地加速]
    C --> F[低成本快速上线]
    D --> G[高控制度<br/>数据不出境]
    E --> H[平衡成本与灵活性]
    style C fill:#c8e6c9,stroke:#43a047,stroke-width:2px
    style D fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
    style E fill:#fff9c4,stroke:#f9a825,stroke-width:2px
生产级图像生成 Pipeline
import base64
import hashlib
import json
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
class ImageModel(Enum):
    """Image generation models a request can target.

    Each member's value is a short string identifier for the model.
    """

    DALLE3 = "dall-e-3"
    STABLE_DIFFUSION_XL = "sdxl"
    FLUX_PRO = "flux-pro"
    MIDJOURNEY = "midjourney"
class ImageResolution(Enum):
    """Output image sizes a request can ask for.

    Each member's value is a size string of the form ``"<number>x<number>"``.
    """

    SQUARE_1024 = "1024x1024"
    LANDSCAPE_1792 = "1792x1024"
    PORTRAIT_1024 = "1024x1792"
@dataclass
class ImageGenerationRequest:
    """Parameters describing a single image generation request.

    Attributes are plain data; ``cache_key`` derives a stable identifier
    from them so that identical requests can share a cached result.
    """

    prompt: str
    negative_prompt: str = ""
    model: ImageModel = ImageModel.DALLE3
    resolution: ImageResolution = ImageResolution.SQUARE_1024
    n_images: int = 1
    quality: str = "standard"  # "standard" / "hd"
    style: str = "vivid"  # "vivid" / "natural"
    seed: Optional[int] = None

    @property
    def cache_key(self) -> str:
        """Return a deterministic 16-hex-character key for this request.

        Every field that can change the generated output takes part in the
        key, including ``n_images`` and ``style``. The fields are serialized
        as a JSON list before hashing so that field boundaries are
        unambiguous: plain concatenation would give ("ab", "c") and
        ("a", "bc") the same key.

        Note: keys differ from those produced by plain concatenation of the
        fields, so cache entries written under that older scheme will not be
        found and will be regenerated.
        """
        payload = json.dumps(
            [
                self.prompt,
                self.negative_prompt,
                self.model.value,
                self.resolution.value,
                self.n_images,
                self.quality,
                self.style,
                self.seed,
            ]
        )
        # MD5 is used only as a fast, non-cryptographic fingerprint here.
        return hashlib.md5(payload.encode("utf-8")).hexdigest()[:16]
@dataclass
class ImageGenerationResult:
    """Outcome of one image generation request."""

    # Identifier assigned to the request that produced this result.
    request_id: str
    # Prompt text recorded for the generation.
    prompt: str
    # Model the images were generated with.
    model: ImageModel
    # Locations of the generated images.
    image_urls: list[str]
    # DALL-E 3 automatically rewrites prompts; this holds the rewritten text.
    revised_prompt: str
    # Time spent generating, in milliseconds.
    generation_time_ms: int
    # Cost of the request in US dollars.
    cost_usd: float
    # Whether this result came from a cache rather than a fresh generation.
    cached: bool = False
class ImageGenerationPipeline:
    """Image generation pipeline with an on-disk result cache and cost tracking.

    Results are cached as one JSON file per request under ``cache_dir``,
    named after the request's ``cache_key``. The actual model call in
    ``generate`` is a placeholder that fabricates image URLs.
    """

    # Price in USD per generated image, keyed by model and then quality tier.
    COST_PER_IMAGE = {
        ImageModel.DALLE3: {"standard": 0.040, "hd": 0.080},
        ImageModel.STABLE_DIFFUSION_XL: {"standard": 0.005, "hd": 0.010},
        ImageModel.FLUX_PRO: {"standard": 0.055, "hd": 0.055},
    }
    # Price used when a model/quality pair is missing from COST_PER_IMAGE.
    DEFAULT_COST_PER_IMAGE = 0.04
    # Prompts are truncated to this many characters (DALL-E 3 limit).
    MAX_PROMPT_CHARS = 4000
    # Substrings (matched case-insensitively) that cause a prompt to be rejected.
    BLOCKED_TERMS = ("nude", "naked", "explicit", "violence", "gore", "underage")

    def __init__(self, cache_dir: Path | None = None):
        """Create the pipeline and make sure the cache directory exists.

        Args:
            cache_dir: Directory for cached results. Defaults to
                ``./image_cache`` relative to the working directory.
        """
        self.cache_dir = Path("./image_cache") if cache_dir is None else Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # In-memory map; not consulted by any method of this class.
        self._cache: dict[str, ImageGenerationResult] = {}
        self._total_cost = 0.0
        self._request_count = 0

    def _try_cache(self, request: ImageGenerationRequest) -> ImageGenerationResult | None:
        """Return the cached result for ``request``, or ``None`` on a miss.

        A missing, unreadable or malformed cache file counts as a miss so
        that the caller regenerates the result instead of crashing.
        """
        cache_path = self.cache_dir / f"{request.cache_key}.json"
        try:
            with open(cache_path, encoding="utf-8") as f:
                data = json.load(f)
            # The file stores the model as its string value; restore the enum
            # so cached and fresh results expose the same type.
            data["model"] = ImageModel(data["model"])
            result = ImageGenerationResult(**data)
        except FileNotFoundError:
            return None
        except (KeyError, TypeError, ValueError):
            # Covers invalid JSON, a non-dict payload, missing/unknown fields
            # and an unrecognized model value.
            return None
        result.cached = True
        return result

    def _save_cache(self, request: ImageGenerationRequest, result: ImageGenerationResult) -> None:
        """Write ``result`` to the cache file belonging to ``request``.

        The ``cached`` flag is not persisted; ``_try_cache`` sets it on load.
        """
        cache_path = self.cache_dir / f"{request.cache_key}.json"
        payload = {
            "request_id": result.request_id,
            "prompt": result.prompt,
            "model": result.model.value,
            "image_urls": result.image_urls,
            "revised_prompt": result.revised_prompt,
            "generation_time_ms": result.generation_time_ms,
            "cost_usd": result.cost_usd,
        }
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False)

    def sanitize_prompt(self, prompt: str) -> str:
        """Validate and normalize a prompt before it is sent to a model.

        The whole prompt is checked against ``BLOCKED_TERMS`` using
        case-insensitive substring matching, then truncated to
        ``MAX_PROMPT_CHARS`` characters and stripped of surrounding
        whitespace.

        Args:
            prompt: Raw prompt text.

        Returns:
            The truncated, stripped prompt.

        Raises:
            ValueError: If the prompt contains a blocked term.
        """
        prompt_lower = prompt.lower()
        for term in self.BLOCKED_TERMS:
            if term in prompt_lower:
                raise ValueError(f"提示词包含不允许的内容: '{term}'")
        return prompt[: self.MAX_PROMPT_CHARS].strip()

    def generate(self, request: ImageGenerationRequest) -> ImageGenerationResult:
        """Generate images for ``request``, reusing a cached result if present.

        The prompt is sanitized before the cache is consulted, so a blocked
        prompt is rejected even if a result for it exists on disk.

        Args:
            request: The generation request.

        Returns:
            The cached result (``cached=True``) or a freshly built one.

        Raises:
            ValueError: If ``request.n_images`` is below 1 or the prompt
                contains a blocked term.
        """
        if request.n_images < 1:
            raise ValueError(f"n_images 必须大于等于 1, 实际为 {request.n_images}")

        clean_prompt = self.sanitize_prompt(request.prompt)

        cached = self._try_cache(request)
        if cached:
            print(f"[Cache HIT] {request.cache_key}")
            return cached

        # perf_counter is monotonic, so the measured duration cannot go
        # negative if the system clock is adjusted mid-request.
        started = time.perf_counter()

        # The real API call goes here (illustrative only):
        # import openai
        # response = openai.images.generate(
        #     model=request.model.value,
        #     prompt=clean_prompt,
        #     size=request.resolution.value,
        #     quality=request.quality,
        #     n=request.n_images,
        # )

        # Simulated response.
        elapsed = int((time.perf_counter() - started) * 1000)
        cost = self.COST_PER_IMAGE.get(request.model, {}).get(
            request.quality, self.DEFAULT_COST_PER_IMAGE
        )
        result = ImageGenerationResult(
            request_id=f"req_{request.cache_key}",
            prompt=clean_prompt,
            model=request.model,
            # One URL per requested image, matching the per-image cost below.
            image_urls=[
                f"https://cdn.example.com/images/{request.cache_key}_{i}.png"
                for i in range(request.n_images)
            ],
            revised_prompt=clean_prompt,
            generation_time_ms=elapsed,
            cost_usd=cost * request.n_images,
        )
        self._total_cost += result.cost_usd
        self._request_count += 1
        self._save_cache(request, result)
        return result

    @property
    def stats(self) -> dict:
        """Return request count, total cost and average cost per request.

        Only non-cached generations are counted. The average is 0.0 when no
        request has been made yet.
        """
        return {
            "total_requests": self._request_count,
            "total_cost_usd": round(self._total_cost, 4),
            "avg_cost_usd": round(self._total_cost / max(self._request_count, 1), 4),
        }
# Usage example: build a pipeline, generate one HD image, then report the
# first image URL, the timing/cost of the request and the running totals.
pipeline = ImageGenerationPipeline()

req = ImageGenerationRequest(
    quality="hd",
    resolution=ImageResolution.SQUARE_1024,
    model=ImageModel.DALLE3,
    prompt="A minimalist product photo of a wireless headphone on white background, professional studio lighting, 8K",
)

result = pipeline.generate(req)

print(f"生成完成: {result.image_urls[0]}")
print(f"耗时: {result.generation_time_ms}ms, 费用: ${result.cost_usd:.3f}")
print(f"统计: {pipeline.stats}")
提示词工程对照表
| 目的 | 推荐关键词 | 示例 |
|---|---|---|
| 产品摄影 | studio lighting, white background, 8K, professional | 白底产品图 |
| 艺术插画 | digital art, concept art, detailed, trending on ArtStation | 游戏风格插画 |
| 写实照片 | photorealistic, DSLR, natural lighting, RAW photo | 真实感人物 |
| UI 设计 | flat design, minimal, clean, UI mockup, Figma style | 应用截图 |
| 负面提示词 | blurry, deformed, extra limbs, watermark, text | 通用负面提示 |
本章小结
- 缓存能大幅降低 API 成本——在重复请求较多的业务场景中,缓存命中率可达 40%+
- 提示词净化是安全必须项——违规内容过滤需在调用前完成
- DALL-E 3 会重写提示词——保存 revised_prompt 用于调试
- 批量生成要控制并发——API 有速率限制,队列管理是关键
- 成本追踪——每次生成都记录费用,便于项目预算管控
下一章:目标检测与图像分类