视频处理与分析
视频 = 图像序列 + 音频 + 时间维度。处理视频是多模态 AI 最复杂也最有价值的方向。
视频处理策略
graph TB
    A[视频输入] --> B{视频长度?}
    B -->|"< 1分钟"| C["直接发送给<br/>Gemini/GPT-4o"]
    B -->|"1-10分钟"| D[关键帧提取]
    B -->|"> 10分钟"| E[分段处理]
    D --> F[提取 N 帧]
    E --> G[切分为片段]
    F --> H[VLM 逐帧/批量分析]
    G --> I[每段提取关键帧]
    I --> H
    H --> J[汇总结果]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    style J fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
关键帧提取
"""
视频关键帧提取策略
"""
import math
from dataclasses import dataclass
@dataclass
class VideoFrame:
    """Reference to one frame of a video, as produced by the samplers below."""

    frame_index: int  # zero-based position of the frame in the video
    timestamp_sec: float  # position of the frame in seconds, rounded to 2 decimals
    is_keyframe: bool = False  # the samplers in KeyFrameExtractor set this to True
    scene_change: bool = False  # not set by any code in this block


class KeyFrameExtractor:
    """Select a small set of frames from a video and estimate analysis cost."""

    def __init__(self, strategy: str = "uniform"):
        """Remember the preferred sampling strategy.

        Args:
            strategy: One of ``"uniform"`` (even sampling), ``"scene_change"``
                (scene-change detection) or ``"smart"`` (uniform sampling
                combined with scene changes). The value is only stored; the
                methods of this class do not read it.
        """
        self.strategy = strategy

    def uniform_sample(
        self,
        total_frames: int,
        fps: float,
        target_frames: int = 10,
    ) -> list[VideoFrame]:
        """Return up to ``target_frames`` frames spread evenly over the video.

        Args:
            total_frames: Number of frames in the video.
            fps: Frames per second, used to derive each timestamp.
            target_frames: Maximum number of frames to return.

        Returns:
            At most ``min(total_frames, target_frames)`` frames in ascending
            order, each flagged as a key frame. An empty list when either
            count is zero or negative.

        Raises:
            ValueError: If frames would be produced but ``fps`` is not positive.
        """
        count = min(total_frames, target_frames)
        if count <= 0:
            return []
        if fps <= 0:
            raise ValueError(f"fps must be positive, got {fps}")

        # Map each of the `count` slots onto the whole frame range. A fixed
        # floor-divided step followed by truncation would leave the end of the
        # video unsampled whenever total_frames is not a multiple of the target.
        indices = (slot * total_frames // count for slot in range(count))
        return [
            VideoFrame(
                frame_index=index,
                timestamp_sec=round(index / fps, 2),
                is_keyframe=True,
            )
            for index in indices
        ]

    def time_based_sample(
        self,
        duration_sec: float,
        fps: float,
        interval_sec: float = 5.0,
    ) -> list[VideoFrame]:
        """Return one frame every ``interval_sec`` seconds, starting at 0.

        Args:
            duration_sec: Length of the video in seconds.
            fps: Frames per second, used to turn timestamps into frame indices.
            interval_sec: Spacing between consecutive samples, in seconds.

        Returns:
            Frames for every sample time strictly below ``duration_sec``, each
            flagged as a key frame. Empty when ``duration_sec`` is not positive.

        Raises:
            ValueError: If ``fps`` or ``interval_sec`` is not positive.
        """
        if fps <= 0:
            raise ValueError(f"fps must be positive, got {fps}")
        if interval_sec <= 0:
            # A non-positive step would never reach duration_sec.
            raise ValueError(f"interval_sec must be positive, got {interval_sec}")

        total_frames = int(duration_sec * fps)
        # Highest valid index; stays at 0 for clips shorter than one frame so
        # that the sample at t=0 is still emitted.
        last_index = max(0, total_frames - 1)

        frames = []
        step = 0
        t = 0.0
        while t < duration_sec:
            frames.append(VideoFrame(
                frame_index=min(int(t * fps), last_index),
                timestamp_sec=round(t, 2),
                is_keyframe=True,
            ))
            step += 1
            # Multiply instead of accumulating so rounding error does not
            # build up and produce a spurious extra sample at the end.
            t = step * interval_sec
        return frames

    def estimate_cost(
        self,
        num_frames: int,
        model: str = "gpt-4o",
    ) -> dict:
        """Estimate the token count and cost of analysing ``num_frames`` frames.

        Args:
            num_frames: Number of frames that will be sent.
            model: Key into the built-in price table. Unknown names fall back
                to the ``"gpt-4o"`` price of 2.50.

        Returns:
            A dict keyed by detail level (``"low"``, ``"high"``); each value
            holds ``"tokens"`` (int) and ``"cost"`` (string formatted as
            ``$0.0000``).

        Raises:
            ValueError: If ``num_frames`` is negative.
        """
        if num_frames < 0:
            raise ValueError(f"num_frames must be non-negative, got {num_frames}")

        # Roughly 85 tokens per frame at low detail, 1105 at high detail.
        detail_costs = {
            "low": 85,
            "high": 1105,
        }
        # Price per one million tokens; rendered with a "$" prefix below.
        pricing = {
            "gpt-4o": 2.50,
            "gpt-4o-mini": 0.15,
            "gemini-2.0-flash": 0.10,
        }
        price = pricing.get(model, 2.50)

        results = {}
        for detail, tokens_per_frame in detail_costs.items():
            # Fixed 200-token allowance on top of the frames
            # (presumably the text prompt -- TODO confirm).
            total_tokens = tokens_per_frame * num_frames + 200
            cost = total_tokens / 1_000_000 * price
            results[detail] = {
                "tokens": total_tokens,
                "cost": f"${cost:.4f}",
            }
        return results
# Usage example
extractor = KeyFrameExtractor()

# A 30-second clip at 30 fps has 900 frames; sample 10 of them.
frames = extractor.uniform_sample(total_frames=900, fps=30, target_frames=10)
print(f"提取 {len(frames)} 个关键帧:")
for frame in frames:
    print(f" 帧 #{frame.frame_index} @ {frame.timestamp_sec}s")

# Cost estimate for sending those 10 frames to gpt-4o
costs = extractor.estimate_cost(10, "gpt-4o")
print("\n10帧分析成本:")
for detail, info in costs.items():
    tokens, cost = info["tokens"], info["cost"]
    print(f" {detail}: {tokens} tokens, {cost}")
视频理解
"""
视频理解与分析
"""
class VideoAnalyzer:
    """Build chat-style request payloads for analysing video key frames."""

    # Gemini accepts video natively.
    # GPT-4o is given the video as a sequence of frame images.

    # Per-task configuration: the prompt text, the image detail level and the
    # model name that analyze() copies into the payload.
    ANALYSIS_TASKS = {
        "摘要": {
            "prompt": (
                "这些是视频的关键帧(按时间顺序排列)。\n"
                "请生成视频摘要:\n"
                "1. 视频主题\n"
                "2. 关键场景描述\n"
                "3. 时间线总结"
            ),
            "detail": "low",
            "model": "gpt-4o-mini",
        },
        "内容审核": {
            "prompt": (
                "请审核这些视频帧是否包含以下违规内容:\n"
                "- 暴力/血腥\n"
                "- 色情/低俗\n"
                "- 违法行为\n"
                "- 危险行为\n"
                "对每个检测项给出 safe/unsafe 判断和置信度。"
            ),
            "detail": "high",
            "model": "gpt-4o",
        },
        "动作识别": {
            "prompt": (
                "分析这些连续帧中发生的动作:\n"
                "1. 主要人物在做什么?\n"
                "2. 动作的开始和结束帧\n"
                "3. 动作的类别(运动/工作/社交等)"
            ),
            "detail": "low",
            "model": "gpt-4o",
        },
    }

    def analyze(
        self,
        frame_paths: list[str],
        task: str = "摘要",
    ) -> dict:
        """Assemble the request payload for one analysis task.

        Nothing is sent anywhere; the method only builds and returns a dict.

        Args:
            frame_paths: Frame locations in chronological order. Each value is
                placed unchanged into ``image_url.url``.
                NOTE(review): presumably these must be URLs or data URIs the
                target API can fetch -- confirm against the caller.
            task: Key into ``ANALYSIS_TASKS``.

        Returns:
            ``{"model", "messages", "task"}`` for a known task, where the
            single user message starts with the task prompt followed by a
            numbered text label and an image entry per frame. For an unknown
            task, ``{"error": ...}``.
        """
        settings = self.ANALYSIS_TASKS.get(task)
        if not settings:
            return {"error": f"不支持的任务: {task}"}

        # Prompt first, then a "[帧 n]" label ahead of every image (1-based).
        parts = [{"type": "text", "text": settings["prompt"]}]
        for number, frame_url in enumerate(frame_paths, start=1):
            label = {"type": "text", "text": f"\n[帧 {number}]:"}
            image = {
                "type": "image_url",
                "image_url": {"url": frame_url, "detail": settings["detail"]},
            }
            parts += [label, image]

        user_message = {"role": "user", "content": parts}
        return {
            "model": settings["model"],
            "messages": [user_message],
            "task": task,
        }
# Video generation models: provider, capabilities, maximum clip length,
# quality tier and availability status for each one.
VIDEO_GEN_MODELS = {
    "Sora": {
        "provider": "OpenAI",
        "能力": "文生视频、图生视频",
        "长度": "最长 60秒",
        "质量": "电影级",
        "状态": "有限预览",
    },
    "Runway Gen-3": {
        "provider": "Runway",
        "能力": "文生视频、图生视频、视频扩展",
        "长度": "最长 10秒",
        "质量": "高",
        "状态": "公开可用",
    },
    "Kling": {
        "provider": "快手",
        "能力": "文生视频、图生视频",
        "长度": "最长 10秒",
        "质量": "高",
        "状态": "公开可用",
    },
    "Pika": {
        "provider": "Pika Labs",
        "能力": "文生视频、图生视频、编辑",
        "长度": "3-4秒",
        "质量": "中高",
        "状态": "公开可用",
    },
}

print("=== 视频生成模型 ===")
for name, info in VIDEO_GEN_MODELS.items():
    provider = info["provider"]
    print(f"\n{name} ({provider}):")
    # Only these three fields are listed; the clip length is not printed.
    for field in ("能力", "质量", "状态"):
        print(f" {field}: {info[field]}")
视频处理最佳实践
| 场景 | 策略 | 帧数 | 模型 |
|---|---|---|---|
| 快速摘要 | 均匀采样 | 5-10帧 | GPT-4o-mini |
| 内容审核 | 密集采样 | 20-30帧 | GPT-4o |
| 教学视频 | 场景变化 | 10-20帧 | GPT-4o |
| 监控视频 | 异常检测 | 按事件 | 本地模型 |
| 直播审核 | 实时采样 | 1帧/5秒 | GPT-4o-mini |
本章小结
- 视频分析核心策略:关键帧提取 + VLM 分析
- 短视频(<1 分钟)可直接交给 Gemini 等原生支持视频输入的模型处理;GPT-4o 需先抽帧,以多张图片的方式输入
- 长视频需要分段 + 关键帧 + 汇总
- 成本控制:先用 low detail 做初筛,再对需要重点关注的帧用 high detail 做精细分析
下一章:语音 AI 技术。