1 min read · 219 words

视频处理与分析

视频 = 图像序列 + 音频 + 时间维度。处理视频是多模态 AI 最复杂也最有价值的方向。

视频处理策略

graph TB
    A[视频输入] --> B{视频长度?}
    B -->|"< 1分钟"| C["直接发送给<br/>Gemini/GPT-4o"]
    B -->|"1-10分钟"| D[关键帧提取]
    B -->|"> 10分钟"| E[分段处理]
    D --> F[提取 N 帧]
    E --> G[切分为片段]
    F --> H[VLM 逐帧/批量分析]
    G --> I[每段提取关键帧]
    I --> H
    H --> J[汇总结果]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    style J fill:#c8e6c9,stroke:#388e3c,stroke-width:2px

关键帧提取

"""
视频关键帧提取策略
"""
import math
from dataclasses import dataclass
@dataclass
class VideoFrame:
    """One frame selected from a video.

    Attributes:
        frame_index: Zero-based index of the frame within the video.
        timestamp_sec: Offset of the frame from the start, in seconds.
        is_keyframe: True when the frame was chosen as a key frame.
        scene_change: Flag for frames that mark a scene change; defaults
            to False.
    """

    frame_index: int
    timestamp_sec: float
    is_keyframe: bool = False
    scene_change: bool = False
class KeyFrameExtractor:
    """Choose which frames of a video to analyse and estimate the cost.

    Provides two sampling helpers (a fixed number of frames, or one frame
    every N seconds) plus a rough token/cost estimate for sending the
    sampled frames to a vision model.
    """

    # Image tokens charged per frame for each ``detail`` level.
    _TOKENS_PER_FRAME = {
        "low": 85,
        "high": 1105,
    }
    # Price in dollars per one million tokens, keyed by model name.
    _PRICE_PER_MILLION = {
        "gpt-4o": 2.50,
        "gpt-4o-mini": 0.15,
        "gemini-2.0-flash": 0.10,
    }
    # Price applied when the model name is not in the table above.
    _DEFAULT_PRICE = 2.50
    # Fixed per-request token overhead added on top of the frame tokens
    # (presumably the text prompt -- TODO confirm).
    _OVERHEAD_TOKENS = 200

    def __init__(self, strategy: str = "uniform"):
        """Store the requested sampling strategy.

        Args:
            strategy: One of ``"uniform"`` (even sampling),
                ``"scene_change"`` (scene-change detection) or ``"smart"``
                (uniform combined with scene changes). The value is only
                stored; the sampling methods below are called explicitly
                and do not consult it.
        """
        self.strategy = strategy

    def uniform_sample(
        self,
        total_frames: int,
        fps: float,
        target_frames: int = 10,
    ) -> list[VideoFrame]:
        """Pick up to ``target_frames`` frames spread evenly over the video.

        Frame ``k`` of ``n`` is taken at index ``k * total_frames // n``, so
        the samples span the whole video even when ``total_frames`` is not a
        multiple of ``target_frames`` (a fixed integer stride followed by
        truncation would only cover the beginning in that case).

        Args:
            total_frames: Number of frames in the video.
            fps: Frames per second, used to derive each timestamp.
            target_frames: Maximum number of frames to return.

        Returns:
            Key frames in ascending index order; empty when either
            ``total_frames`` or ``target_frames`` is not positive.

        Raises:
            ValueError: If frames would be produced but ``fps`` is not
                positive.
        """
        # Never ask for more frames than the video contains.
        count = min(target_frames, total_frames)
        if count <= 0:
            return []
        if fps <= 0:
            raise ValueError(f"fps must be positive, got {fps!r}")

        return [
            VideoFrame(
                frame_index=index,
                timestamp_sec=round(index / fps, 2),
                is_keyframe=True,
            )
            for index in (k * total_frames // count for k in range(count))
        ]

    def time_based_sample(
        self,
        duration_sec: float,
        fps: float,
        interval_sec: float = 5.0,
    ) -> list[VideoFrame]:
        """Pick one frame every ``interval_sec`` seconds, starting at 0.

        Args:
            duration_sec: Length of the video in seconds.
            fps: Frames per second, used to map timestamps to indices.
            interval_sec: Spacing between sampled frames, in seconds.

        Returns:
            Key frames for timestamps ``0, interval_sec, 2 * interval_sec,
            ...`` strictly below ``duration_sec``.

        Raises:
            ValueError: If ``interval_sec`` is not positive (the sampling
                loop would otherwise never terminate).
        """
        if interval_sec <= 0:
            raise ValueError(
                f"interval_sec must be positive, got {interval_sec!r}"
            )

        # Highest valid frame index; guards against landing one past the end
        # when duration_sec * fps is not a whole number.
        last_index = max(int(duration_sec * fps) - 1, 0)

        frames = []
        step = 0
        t = 0.0
        while t < duration_sec:
            frames.append(VideoFrame(
                frame_index=min(int(t * fps), last_index),
                timestamp_sec=round(t, 2),
                is_keyframe=True,
            ))
            step += 1
            # Multiply instead of accumulating to avoid float drift.
            t = step * interval_sec
        return frames

    def estimate_cost(
        self,
        num_frames: int,
        model: str = "gpt-4o",
    ) -> dict:
        """Estimate tokens and cost of analysing ``num_frames`` frames.

        Args:
            num_frames: Number of frames that will be sent.
            model: Model name used to look up the per-million-token price;
                unknown names fall back to the default price of 2.50.

        Returns:
            A dict keyed by detail level (``"low"``, ``"high"``), each value
            holding ``"tokens"`` (int) and ``"cost"`` (a ``"$0.0000"``
            formatted string).
        """
        # The price does not depend on the detail level, so look it up once.
        price = self._PRICE_PER_MILLION.get(model, self._DEFAULT_PRICE)

        results = {}
        for detail, tokens_per_frame in self._TOKENS_PER_FRAME.items():
            total_tokens = tokens_per_frame * num_frames + self._OVERHEAD_TOKENS
            cost = total_tokens / 1_000_000 * price
            results[detail] = {
                "tokens": total_tokens,
                "cost": f"${cost:.4f}",
            }
        return results
# Usage example
extractor = KeyFrameExtractor()

# A 30-second video at 30 fps contains 900 frames.
frames = extractor.uniform_sample(total_frames=900, fps=30, target_frames=10)
print(f"提取 {len(frames)} 个关键帧:")
for frame in frames:
    print(f"  帧 #{frame.frame_index} @ {frame.timestamp_sec}s")

# Cost estimate for the ten sampled frames
costs = extractor.estimate_cost(10, "gpt-4o")
print(f"\n10帧分析成本:")
for level, estimate in costs.items():
    print(f"  {level}: {estimate['tokens']} tokens, {estimate['cost']}")

视频理解

"""
视频理解与分析
"""
class VideoAnalyzer:
    """Build multi-image chat requests for analysing video key frames.

    Each supported task in ``ANALYSIS_TASKS`` carries a prompt, an image
    ``detail`` level and a model name. ``analyze`` only assembles the
    request payload; it does not send it.
    """

    # Gemini accepts video natively; GPT-4o is given a sequence of frame
    # images instead.
    ANALYSIS_TASKS = {
        "摘要": {
            "prompt": (
                "这些是视频的关键帧（按时间顺序排列）。\n"
                "请生成视频摘要：\n"
                "1. 视频主题\n"
                "2. 关键场景描述\n"
                "3. 时间线总结"
            ),
            "detail": "low",
            "model": "gpt-4o-mini",
        },
        "内容审核": {
            "prompt": (
                "请审核这些视频帧是否包含以下违规内容：\n"
                "- 暴力/血腥\n"
                "- 色情/低俗\n"
                "- 违法行为\n"
                "- 危险行为\n"
                "对每个检测项给出 safe/unsafe 判断和置信度。"
            ),
            "detail": "high",
            "model": "gpt-4o",
        },
        "动作识别": {
            "prompt": (
                "分析这些连续帧中发生的动作：\n"
                "1. 主要人物在做什么？\n"
                "2. 动作的开始和结束帧\n"
                "3. 动作的类别（运动/工作/社交等）"
            ),
            "detail": "low",
            "model": "gpt-4o",
        },
    }

    @staticmethod
    def _to_image_url(path):
        """Return a value usable as ``image_url.url`` for ``path``.

        http(s) URLs and ``data:`` URLs are returned unchanged. A path that
        names an existing local file is read and returned as a base64 data
        URL, because a bare filesystem path is not a valid image URL for
        the chat API. Anything else is passed through untouched so existing
        callers keep their behaviour.
        """
        # Imported locally so the block does not depend on file-level imports.
        import base64
        import mimetypes
        import os

        if not isinstance(path, str):
            return path
        if path.startswith(("http://", "https://", "data:")):
            return path
        if not os.path.isfile(path):
            return path

        # Fall back to JPEG when the extension gives no hint.
        mime_type = mimetypes.guess_type(path)[0] or "image/jpeg"
        with open(path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("ascii")
        return f"data:{mime_type};base64,{encoded}"

    def analyze(
        self,
        frame_paths: list[str],
        task: str = "摘要",
    ) -> dict:
        """Assemble the chat request for one analysis task.

        Args:
            frame_paths: Key frames in chronological order, given as image
                URLs, data URLs, or paths to local image files (local files
                are embedded as base64 data URLs).
            task: Key into ``ANALYSIS_TASKS``.

        Returns:
            ``{"model", "messages", "task"}`` for a supported task, or
            ``{"error": ...}`` when ``task`` is not in ``ANALYSIS_TASKS``.
        """
        config = self.ANALYSIS_TASKS.get(task)
        if config is None:
            return {"error": f"不支持的任务: {task}"}

        # One user message: the task prompt, then a label + image per frame.
        content = [
            {"type": "text", "text": config["prompt"]},
        ]
        for number, path in enumerate(frame_paths, start=1):
            # The numbered label lets the model refer to frames by position
            # (the action-recognition prompt asks for start/end frames).
            content.append({
                "type": "text",
                "text": f"\n[帧 {number}]:",
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": self._to_image_url(path),
                    "detail": config["detail"],
                },
            })

        return {
            "model": config["model"],
            "messages": [{"role": "user", "content": content}],
            "task": task,
        }
# Video generation models: provider, capabilities, maximum clip length,
# quality tier and availability for each one.
VIDEO_GEN_MODELS = {
    "Sora": {
        "provider": "OpenAI",
        "能力": "文生视频、图生视频",
        "长度": "最长 60秒",
        "质量": "电影级",
        "状态": "有限预览",
    },
    "Runway Gen-3": {
        "provider": "Runway",
        "能力": "文生视频、图生视频、视频扩展",
        "长度": "最长 10秒",
        "质量": "高",
        "状态": "公开可用",
    },
    "Kling": {
        "provider": "快手",
        "能力": "文生视频、图生视频",
        "长度": "最长 10秒",
        "质量": "高",
        "状态": "公开可用",
    },
    "Pika": {
        "provider": "Pika Labs",
        "能力": "文生视频、图生视频、编辑",
        "长度": "3-4秒",
        "质量": "中高",
        "状态": "公开可用",
    },
}

# Print a short summary per model: capabilities, quality and availability.
print("=== 视频生成模型 ===")
for model_name, spec in VIDEO_GEN_MODELS.items():
    print(f"\n{model_name} ({spec['provider']}):")
    for field in ("能力", "质量", "状态"):
        print(f"  {field}: {spec[field]}")

视频处理最佳实践

场景	策略	帧数	模型
快速摘要	均匀采样	5-10帧	GPT-4o-mini
内容审核	密集采样	20-30帧	GPT-4o
教学视频	场景变化	10-20帧	GPT-4o
监控视频	异常检测	按事件	本地模型
直播审核	实时采样	1帧/5秒	GPT-4o-mini

本章小结

- 视频分析核心策略：关键帧提取 + VLM 分析
- 短视频（<1min）可直接用 Gemini 原生处理
- 长视频需要分段 + 关键帧 + 汇总
- 成本控制：用 low detail 做初筛，high detail 做精析

下一章：语音 AI 技术。