多模态 RAG 与跨模态检索
High Contrast
Dark Mode
Light Mode
Sepia
Forest
1 min read · 144 words

多模态 RAG 与跨模态检索

传统 RAG 只检索文本——多模态 RAG 让你用文字搜图片、用图片搜文档。

多模态 RAG 架构

```mermaid
graph TB
    subgraph 输入
        Q1[文本查询]
        Q2[图片查询]
        Q3[语音查询]
    end
    subgraph 编码层
        E1[文本 Encoder]
        E2[图像 Encoder]
        E3[语音 Encoder]
    end
    Q1 --> E1
    Q2 --> E2
    Q3 --> E3
    E1 --> VS[统一向量空间]
    E2 --> VS
    E3 --> VS
    subgraph 知识库
        D1[文本文档]
        D2[图片/图表]
        D3[视频帧]
        D4[PDF 页面]
    end
    VS --> R[跨模态检索]
    R --> D1
    R --> D2
    R --> D3
    R --> D4
    D1 --> LLM[多模态 LLM]
    D2 --> LLM
    D3 --> LLM
    D4 --> LLM
    Q1 --> LLM
    LLM --> A[融合回答]
    style VS fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style A fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
```

CLIP 跨模态检索

"""
CLIP: 连接文字与图片的桥梁
- 同一向量空间编码文本和图像
- 用文字搜图片、用图片搜文字
"""
from dataclasses import dataclass, field
@dataclass
class CLIPSearchEngine:
    """Cross-modal (text <-> image) search engine built on CLIP.

    Holds the model identifier plus two parallel lists: embedding i in
    ``image_embeddings`` belongs to ``image_paths[i]``.
    """

    # HuggingFace model id referenced by the snippet below
    model_name: str = "openai/clip-vit-large-patch14"
    image_embeddings: list = field(default_factory=list)
    image_paths: list = field(default_factory=list)

    def setup_code(self) -> str:
        """Return the reference implementation of the CLIP search core."""
        return """
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# 对图片编码
def encode_image(image_path: str) -> np.ndarray:
    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        features = model.get_image_features(**inputs)
    # L2 归一化
    features = features / features.norm(dim=-1, keepdim=True)
    return features.numpy()[0]

# 对文本编码
def encode_text(text: str) -> np.ndarray:
    inputs = processor(text=[text], return_tensors="pt")
    with torch.no_grad():
        features = model.get_text_features(**inputs)
    features = features / features.norm(dim=-1, keepdim=True)
    return features.numpy()[0]

# 跨模态检索:文字搜图片
def search_images(query: str, image_embeddings, top_k=5):
    query_emb = encode_text(query)
    similarities = np.dot(image_embeddings, query_emb)
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [(idx, similarities[idx]) for idx in top_indices]
"""
# Model comparison: cross-modal embedding options and their trade-offs.
MODELS = {
    "CLIP (OpenAI)": {
        "维度": 768,
        "速度": "快",
        "精度": "⭐⭐⭐",
        "优势": "通用性强、零样本迁移",
        "场景": "通用图文搜索",
    },
    "SigLIP (Google)": {
        "维度": 768,
        "速度": "快",
        "精度": "⭐⭐⭐⭐",
        "优势": "比 CLIP 更准确",
        "场景": "需要更高精度的检索",
    },
    "Chinese-CLIP": {
        "维度": 768,
        "速度": "快",
        "精度": "⭐⭐⭐⭐",
        "优势": "中文理解好",
        "场景": "中文图文检索",
    },
    "OpenAI Embeddings": {
        "维度": 3072,
        "速度": "中",
        "精度": "⭐⭐⭐⭐⭐",
        "优势": "API 调用简单",
        "场景": "文本检索(无图像)",
    },
}

# Compare retrieval options.
# BUG FIX: MODELS is a module-level constant, not an attribute of
# CLIPSearchEngine, so the original `engine.MODELS.items()` raised
# AttributeError at runtime. Iterate MODELS directly; the unused
# engine instance was removed.
print("=== 跨模态检索模型对比 ===")
for name, info in MODELS.items():
    print(f"\n{name} (维度: {info['维度']}):")
    print(f"  精度: {info['精度']}")
    print(f"  场景: {info['场景']}")

多模态向量数据库

"""
存储和检索多模态向量
"""
from dataclasses import dataclass
@dataclass
class VectorDBConfig:
    """Side-by-side comparison of vector databases for multimodal storage."""

    # name -> {multimodal support flag, key trait, illustrative snippet}
    DATABASES = {
        "Qdrant": {
            "多模态": "✅ Named Vectors",
            "特点": "支持同一对象存多个向量(文本+图像)",
            "示意": """
from qdrant_client import QdrantClient, models

client = QdrantClient("localhost", port=6333)

# 创建支持多向量的 collection
client.create_collection(
    collection_name="multimodal",
    vectors_config={
        "text": models.VectorParams(size=768, distance="Cosine"),
        "image": models.VectorParams(size=768, distance="Cosine"),
    }
)

# 插入:一个文档包含文本和图片向量
client.upsert(
    collection_name="multimodal",
    points=[
        models.PointStruct(
            id=1,
            vector={
                "text": text_embedding,
                "image": image_embedding,
            },
            payload={
                "title": "产品说明书",
                "image_url": "product.png",
                "text": "这是一款智能手表..."
            }
        )
    ]
)

# 检索:用文本查询搜索图像向量
results = client.search(
    collection_name="multimodal",
    query_vector=("image", text_query_embedding),
    limit=10
)
""",
        },
        "Weaviate": {
            "多模态": "✅ Multi2Vec",
            "特点": "内置多模态模块、自动编码",
            "示意": """
# Weaviate 内置 CLIP 模块
# 自动将文本和图像编码到同一空间

# 配置
{
    "class": "Product",
    "vectorizer": "multi2vec-clip",
    "moduleConfig": {
        "multi2vec-clip": {
            "textFields": ["description"],
            "imageFields": ["image"]
        }
    }
}
""",
        },
        "ChromaDB": {
            "多模态": "⚠️ 需手动编码",
            "特点": "轻量、Python 原生、快速原型",
            "示意": """
import chromadb

client = chromadb.Client()
collection = client.create_collection("multimodal")

# 手动编码后存入
collection.add(
    ids=["img_1"],
    embeddings=[clip_image_embedding],
    metadatas=[{"type": "image", "path": "img.png"}]
)

# 用文本向量搜索
results = collection.query(
    query_embeddings=[clip_text_embedding],
    n_results=5
)
""",
        },
    }

    @classmethod
    def recommend(cls, scenario: str) -> str:
        """Return the database best suited to *scenario* (Qdrant by default)."""
        choices = {
            "生产级": "Qdrant",
            "内置编码": "Weaviate",
            "快速原型": "ChromaDB",
            "大规模": "Qdrant",
        }
        return choices.get(scenario, "Qdrant")
# Demo: print the multimodal-support summary for each database option.
config = VectorDBConfig()
print("=== 多模态向量数据库 ===")
for name, info in config.DATABASES.items():
    print(f"\n{name}:")
    print(f"  多模态支持: {info['多模态']}")
    print(f"  特点: {info['特点']}")

多模态 RAG Pipeline

"""
完整多模态 RAG 管道
"""
class MultimodalRAG:
    """End-to-end multimodal RAG pipeline description (text + image)."""

    def __init__(self):
        # One in-memory index per modality.
        self.text_index = []
        self.image_index = []

    def ingest_document(self, doc_path: str) -> dict:
        """Describe the ingestion steps for a mixed text/image document.

        Returns a dict with the ordered ``steps`` and the source ``doc`` path.
        """
        steps = [
            "1. 解析文档 → 提取文本段落和嵌入图片",
            "2. 文本 → text embedding (OpenAI/BGE)",
            "3. 图片 → image embedding (CLIP/SigLIP)",
            "4. 图片 → VLM 生成描述文本 → text embedding",
            "5. 存入向量数据库(text + image 双向量)",
        ]
        return {"steps": steps, "doc": doc_path}

    def query(self, question: str, image=None) -> dict:
        """Describe the query pipeline; *image* is an optional visual query."""
        pipeline = {
            "step1_encode": (
                "文本查询 → text embedding; "
                "图片查询 → image embedding"
            ),
            "step2_retrieve": (
                "多路检索: "
                "文本→文本, 文本→图片, 图片→图片, 图片→文本"
            ),
            "step3_rerank": "跨模态重排序 (交叉注意力)",
            "step4_generate": "多模态 LLM 融合上下文生成回答",
        }
        return {
            "question": question,
            "has_image": image is not None,
            "pipeline": pipeline,
        }

    def retrieval_strategies(self) -> dict:
        """Map each retrieval direction to its method, encoder and use case."""
        return {
            "文本→文本": {
                "方法": "标准语义检索",
                "Encoder": "text-embedding-3-small",
                "场景": "常规文本问答",
            },
            "文本→图片": {
                "方法": "CLIP 跨模态检索",
                "Encoder": "CLIP ViT-L",
                "场景": "'找一张关于XX的图'",
            },
            "图片→图片": {
                "方法": "图像相似度检索",
                "Encoder": "CLIP ViT-L",
                "场景": "'找类似的图片'",
            },
            "图片→文本": {
                "方法": "CLIP 反向检索",
                "Encoder": "CLIP ViT-L",
                "场景": "'这张图相关的文档'",
            },
            "混合检索": {
                "方法": "多路检索 + RRF 融合",
                "Encoder": "多个",
                "场景": "综合性问答",
            },
        }
# Demo: walk through document ingestion, then list every retrieval strategy.
rag = MultimodalRAG()
print("=== 多模态 RAG Pipeline ===")
ingest = rag.ingest_document("product-manual.pdf")
for s in ingest["steps"]:
    print(f"  {s}")

print("\n=== 检索策略 ===")
for name, info in rag.retrieval_strategies().items():
    print(f"  {name}: {info['方法']} → {info['场景']}")

图片描述增强检索

"""
通过 VLM 为图片生成文本描述,增强检索效果
"""
class ImageCaptionEnhancer:
    """Boost image retrieval by having a VLM generate textual captions."""

    # Prompt sent to the VLM; requests a structured JSON caption.
    CAPTION_PROMPT = """请为这张图片生成详细描述,包括:
1. 主要内容/对象
2. 文字信息(如有)
3. 数据/图表的关键数据点(如有)
4. 颜色/风格特征
输出 JSON:
{
"brief": "一句话描述",
"detailed": "详细描述",
"keywords": ["关键词1", "关键词2"],
"text_content": "图中文字内容",
"data_points": {"关键数据": "值"}
}"""

    @staticmethod
    def enhance_retrieval() -> dict:
        """Return the caption-based retrieval-enhancement strategies."""
        return {
            "双编码": "CLIP 视觉编码 + 文本描述编码",
            "关键词索引": "提取关键词做倒排索引",
            "OCR 补充": "提取图中文字做全文搜索",
            "融合打分": "视觉相似度 × 0.6 + 文本相似度 × 0.4",
        }
# Demo: list each enhancement strategy and its description.
enhancer = ImageCaptionEnhancer()
print("=== 图片增强检索策略 ===")
for k, v in enhancer.enhance_retrieval().items():
    print(f"  {k}: {v}")

本章小结

| 技术 | 用途 | 复杂度 | 效果 |
| --- | --- | --- | --- |
| CLIP 检索 | 文字搜图片 | ⭐⭐ | ⭐⭐⭐ |
| 多向量存储 | 统一知识库 | ⭐⭐⭐ | ⭐⭐⭐⭐ |
| VLM 描述增强 | 提高召回 | ⭐⭐ | ⭐⭐⭐⭐ |
| 多模态 RAG | 融合问答 | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |

下一章:视觉 Agent 与高级多模态应用。