2 min read · 451 words

多模态系统评估

多模态评估的独特挑战

评估一个多模态系统比评估纯文本 LLM 复杂得多——需要同时验证视觉理解、文本生成质量、跨模态一致性、安全性和用户体验。

graph TB
    A[多模态评估维度] --> B[视觉理解]
    A --> C[文本质量]
    A --> D[跨模态一致性]
    A --> E[安全性]
    A --> F[用户体验]
    B --> B1[目标识别准确率]
    B --> B2[场景理解]
    B --> B3[空间关系]
    B --> B4[文字识别 OCR]
    C --> C1[描述准确性]
    C --> C2[回答相关性]
    D --> D1[图文匹配度]
    D --> D2[信息一致性]
    E --> E1[内容安全]
    E --> E2[隐私保护]
    F --> F1[响应速度]
    F --> F2[交互自然度]
    style A fill:#e3f2fd,stroke:#1976d2,stroke-width:3px

核心评估指标

维度	指标	测量方法	目标值
视觉理解	目标检测 mAP	COCO/Pascal VOC	>0.75
视觉理解	OCR 字符准确率	逐字比对	>0.95
视觉理解	VQA 准确率	标准问答集	>0.80
文本质量	描述忠实度	LLM Judge	>0.85
跨模态	图文匹配 CLIPScore	CLIP 余弦相似度	>0.30
跨模态	幻觉率	不在图像中的描述比例	<5%
安全	有害图像检测	安全分类器	>0.99
性能	端到端延迟	P95 计时	<3s

"""
多模态评估框架
"""
from dataclasses import dataclass
@dataclass
class MultimodalEvalSample:
    """One evaluation sample for a multimodal system.

    A plain record with no behaviour of its own. The field meanings below
    are inferred from the field names; nothing in the visible part of this
    file reads them, so confirm against the code that builds the samples.

    Attributes:
        image_path: Presumably the location of the image under test.
        question: Presumably the question asked about the image.
        expected_answer: Presumably the reference answer for ``question``.
        category: Presumably a label grouping samples by capability.
        difficulty: Free-form difficulty label; defaults to ``"medium"``.
    """

    image_path: str
    question: str
    expected_answer: str
    category: str
    difficulty: str = "medium"
@dataclass
class MultimodalEvalResult:
    """Evaluation outcome for one multimodal sample.

    A plain record with no behaviour of its own. The field meanings below
    are inferred from the field names; nothing in the visible part of this
    file constructs or reads this class, so confirm against its users.

    Attributes:
        sample_id: Presumably an identifier of the evaluated sample.
        predicted: Presumably the model output that was scored.
        scores: Presumably a mapping of metric name to score.
    """

    sample_id: str
    predicted: str
    scores: dict
class MultimodalEvaluator:
    """Score model outputs with a set of named, weighted scoring functions.

    Every registered function is called as
    ``fn(prediction, reference, image_desc)`` and must return a number
    (it is passed to ``round``). Per-sample results contain one entry per
    registered function plus a weighted average stored under ``"overall"``.
    """

    # Key used for the weighted aggregate. Scorers may not register under it,
    # otherwise their score would be overwritten by the aggregate.
    _OVERALL_KEY = "overall"

    def __init__(self):
        # Maps scorer name -> {"fn": callable, "weight": number}.
        self.evaluators = {}

    def add_evaluator(self, name: str, fn, weight: float = 1.0):
        """Register a scoring function, replacing any existing one of that name.

        Args:
            name: Key under which the score appears in the results.
            fn: Callable invoked as ``fn(prediction, reference, image_desc)``.
            weight: Relative weight of this score in the ``"overall"`` average.

        Raises:
            ValueError: If ``name`` is ``"overall"``, which is reserved for
                the weighted aggregate.
        """
        if name == self._OVERALL_KEY:
            raise ValueError(
                f"{self._OVERALL_KEY!r} is reserved for the weighted aggregate score"
            )
        self.evaluators[name] = {"fn": fn, "weight": weight}

    def evaluate_sample(self, prediction: str, reference: str, image_desc: str = "") -> dict:
        """Score a single prediction with every registered function.

        Args:
            prediction: Model output to score.
            reference: Reference value handed to each scoring function.
            image_desc: Third argument handed to each scoring function.

        Returns:
            A dict mapping each scorer name to its score rounded to four
            decimals, plus ``"overall"``: the weight-normalised average of
            those rounded scores, also rounded to four decimals. When no
            scorer is registered, or the weights sum to zero, ``"overall"``
            is ``0.0``.
        """
        scores = {
            name: round(config["fn"](prediction, reference, image_desc), 4)
            for name, config in self.evaluators.items()
        }

        total_weight = sum(config["weight"] for config in self.evaluators.values())
        if total_weight:
            weighted = sum(
                scores[name] * self.evaluators[name]["weight"] for name in scores
            ) / total_weight
        else:
            # Nothing to average over; avoid dividing by zero.
            weighted = 0.0

        scores[self._OVERALL_KEY] = round(weighted, 4)
        return scores

    def evaluate_dataset(self, results: list[tuple[str, str, str]]) -> dict:
        """Score a whole dataset and summarise each metric.

        Args:
            results: Iterable of ``(prediction, reference, image_desc)``
                triples, each forwarded to ``evaluate_sample``.

        Returns:
            A dict mapping every scorer name and ``"overall"`` to a dict with
            the ``"mean"``, ``"min"`` and ``"max"`` of that metric across the
            samples, each rounded to four decimals. For an empty dataset all
            three statistics are ``0.0``.
        """
        all_scores = [
            self.evaluate_sample(pred, ref, img_desc)
            for pred, ref, img_desc in results
        ]

        metrics = [*self.evaluators, self._OVERALL_KEY]
        summary = {}
        for metric in metrics:
            values = [sample_scores[metric] for sample_scores in all_scores]
            if not values:
                # Empty dataset: keep the result shape stable instead of
                # dividing by zero or calling min()/max() on nothing.
                summary[metric] = {"mean": 0.0, "min": 0.0, "max": 0.0}
                continue
            summary[metric] = {
                "mean": round(sum(values) / len(values), 4),
                "min": round(min(values), 4),
                "max": round(max(values), 4),
            }
        return summary
# Visual hallucination detection
class VisualHallucinationDetector:
    """Heuristic detector for visual hallucinations in a model response.

    Sentences (split on the full stop "。") that contain one of the trigger
    phrases are treated as visual claims. A claim counts as hallucinated when
    it mentions none of the object names known to be in the image.
    """

    def __init__(self):
        # Phrases that introduce a statement about what the image shows
        # (roughly "one can see", "the image shows", "the picture contains").
        self.hallucination_patterns = [
            "可以看到",
            "图中显示",
            "画面中有",
        ]

    def detect(self, response: str, image_objects: list[str]) -> dict:
        """Estimate how many visual claims in ``response`` are unsupported.

        Args:
            response: Model output to inspect.
            image_objects: Names of objects known to be in the image.
                Matching is a case-insensitive substring test; blank names
                are ignored and surrounding whitespace is stripped.

        Returns:
            A dict with ``"total_claims"`` (number of visual claims found),
            ``"hallucinated"`` (claims mentioning no known object),
            ``"hallucination_rate"`` (their ratio, ``0.0`` when there are no
            claims) and ``"examples"`` (up to three hallucinated claims).
        """
        claims = self._extract_visual_claims(response)

        # Normalise once. Blank names are dropped because "" is a substring
        # of every string and would mark every claim as grounded.
        known_objects = [obj.strip().lower() for obj in image_objects if obj.strip()]

        hallucinated = []
        for claim in claims:
            lowered = claim.lower()
            if not any(obj in lowered for obj in known_objects):
                hallucinated.append(claim)

        return {
            "total_claims": len(claims),
            "hallucinated": len(hallucinated),
            "hallucination_rate": len(hallucinated) / len(claims) if claims else 0.0,
            "examples": hallucinated[:3],
        }

    def _extract_visual_claims(self, text: str) -> list[str]:
        """Return the sentences of ``text`` that contain a trigger phrase.

        The text is split on "。" once. Each matching sentence is returned
        exactly once, stripped, in its order of appearance, even when it
        contains several trigger phrases.
        """
        return [
            sentence.strip()
            for sentence in text.split("。")
            if any(pattern in sentence for pattern in self.hallucination_patterns)
        ]
# Benchmark datasets for multimodal evaluation.
# Each entry carries the benchmark name plus three descriptive fields whose
# keys are kept in Chinese because they are read at runtime below:
# "规模" (size), "能力覆盖" (capabilities covered), "语言" (language).
BENCHMARK_DATASETS = [
    {
        "name": "MMBench",
        "规模": "3,000+ 样本",
        "能力覆盖": "感知、推理、知识",
        "语言": "中英双语",
    },
    {
        "name": "MMMU",
        "规模": "11,500 样本",
        "能力覆盖": "大学水平多学科",
        "语言": "英语",
    },
    {
        "name": "MM-Vet",
        "规模": "218 样本",
        "能力覆盖": "集成视觉-语言能力",
        "语言": "英语",
    },
    {
        "name": "SEEDBench",
        "规模": "19,000+ 样本",
        "能力覆盖": "图像+视频理解",
        "语言": "英语",
    },
]

# Print a one-line overview per benchmark when the listing is run.
print("多模态基准数据集:")
for ds in BENCHMARK_DATASETS:
    print(f"  {ds['name']}: {ds['规模']} | {ds['能力覆盖']} | {ds['语言']}")

生成质量评估（图像/视频）

指标	衡量内容	适用场景	工具
FID	生成图像与真实分布的距离	图像生成整体质量	torch-fidelity
CLIPScore	图文匹配度	Prompt 遵循度	open-clip
IS	生成多样性 + 清晰度	生成模型对比	torch-fidelity
LPIPS	感知相似度	图像编辑/变换	lpips
人工评估	主观质量偏好	最终产品验收	标注平台

本章小结

- 多模态评估需覆盖视觉理解、文本质量、跨模态一致性、安全性和用户体验五大维度
- 视觉幻觉（模型声称看到图中不存在的内容）是多模态特有的问题
- MMBench、MMMU 等基准提供了标准化的多模态评估方法
- 生成类任务使用 FID、CLIPScore 等指标衡量质量
- 最终产品质量仍需人工评估把关

延伸阅读：参考 LLM 评估与测试指南，了解更全面的 AI 评估方法论。