Appearance
多模态 AI 应用开发指南
多模态 AI 正在从"演示特性"进化为"生产力工具"。本指南覆盖多模态模型的 API 调用、图像/视频/音频处理、多模态 RAG 系统搭建与实际应用场景。
快速入门:选择你的多模态模型
| 模型 | 图像 | 视频 | 音频 | 文档 | API 易用性 | 定价 |
|---|---|---|---|---|---|---|
| GPT-4o | ✅ | ✅ | ✅ | ✅ | ★★★★★ | $$ |
| Gemini 2.5 Pro | ✅ | ✅ | ✅ | ✅ | ★★★★☆ | $$ |
| Claude 3.5 Sonnet | ✅ | ❌ | ❌ | ✅ | ★★★★★ | $$ |
| Qwen-VL | ✅ | ❌ | ❌ | ✅ | ★★★★☆ | ¥ |
| LLaVA | ✅ | ❌ | ❌ | ❌ | ★★★☆☆ | 免费 |
第一章:图像理解与处理
GPT-4o Vision API 实战
python
import base64

import openai

client = openai.OpenAI()


def encode_image(image_path):
    """Return the file at ``image_path`` as a base64-encoded string.

    The result is the payload placed after ``data:image/jpeg;base64,`` in the
    ``image_url`` part of a chat message.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to ``str``.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


# Image understanding: a single user message carrying a text part and an
# image part. The image is inlined as a data URL built from the local file
# (previously the file was opened but never read, and a literal "..."
# placeholder was sent instead of the image data).
image_b64 = encode_image("image.jpg")
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "请描述这张图片中的内容,并列出关键信息。"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }
    ],
)
print(response.choices[0].message.content)


# 图像分析常见任务
python
# 1. OCR / text extraction
def extract_text_from_image(image_path):
    """Ask GPT-4o to transcribe the text visible in an image.

    Args:
        image_path: Image location, handed to ``encode_image`` (assumed to
            return the base64 payload for the data URL — confirm against its
            definition).

    Returns:
        The message content of the first choice in the model response.
    """
    image_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "请提取这张图片中的所有文字,保持原有排版格式。"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    )
    return response.choices[0].message.content
# 2. Table parsing
def parse_table_from_image(image_path):
    """Ask GPT-4o to convert a table shown in an image into Markdown.

    Args:
        image_path: Image location, handed to ``encode_image`` (assumed to
            return the base64 payload for the data URL — confirm against its
            definition).

    Returns:
        The message content of the first choice in the model response.
    """
    image_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "请将这张图片中的表格转换为 Markdown 格式。"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    )
    return response.choices[0].message.content
# 3. Image classification and labelling
def classify_image(image_path, categories):
    """Ask GPT-4o to pick one category for an image.

    Args:
        image_path: Image location, handed to ``encode_image`` (assumed to
            return the base64 payload for the data URL — confirm against its
            definition).
        categories: Candidate categories; interpolated into the prompt with
            its default string formatting.

    Returns:
        The message content of the first choice in the model response. The
        prompt asks for the category name only.
    """
    image_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"请将这张图片分类到以下类别之一:{categories}。只返回类别名称。"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    )
    return response.choices[0].message.content


# 多图片对比分析
python
def compare_images(image_paths, question):
    """Send several images plus one question to GPT-4o in a single message.

    Args:
        image_paths: Iterable of image locations; each is passed to
            ``encode_image`` and attached in order after the question.
        question: Text prompt placed first in the message content.

    Returns:
        The message content of the first choice in the model response.
    """
    content = [{"type": "text", "text": question}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"},
        }
        for path in image_paths
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
    )
    return response.choices[0].message.content
# Usage example: compare two product shots in one request.
result = compare_images(
    ["product_v1.jpg", "product_v2.jpg"],
    "请比较这两个产品设计的差异,分析各自的优缺点。",
)


# 第二章:视频分析与处理
视频理解工作流
python
import cv2
import base64
def _sample_video_frames(video_path, sample_interval):
    """Return ``(sample_index, base64_jpeg)`` pairs for frames taken every ``sample_interval`` seconds."""
    cap = cv2.VideoCapture(video_path)
    samples = []
    try:
        if not cap.isOpened():
            return samples
        fps = cap.get(cv2.CAP_PROP_FPS)
        # ``not fps > 0`` also rejects NaN; a zero frame rate used to surface
        # later as a ZeroDivisionError in the modulo below.
        if not fps > 0:
            raise ValueError(f"video reports an invalid frame rate: {fps!r}")
        # At least one frame per step, so sub-frame intervals sample every frame.
        frame_interval = max(1, int(fps * sample_interval))
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                encoded_ok, buffer = cv2.imencode('.jpg', frame)
                if encoded_ok:
                    frame_base64 = base64.b64encode(buffer).decode('utf-8')
                    # Keep the sample index so a skipped frame does not shift
                    # the timestamps of the frames that follow it.
                    samples.append((frame_count // frame_interval, frame_base64))
            frame_count += 1
    finally:
        # Release the capture even when reading or encoding raises.
        cap.release()
    return samples


def analyze_video(video_path, sample_interval=5):
    """Sample frames from a video and have GPT-4o describe each one.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        sample_interval: Number of seconds between sampled frames.

    Returns:
        A list of ``{"timestamp": ..., "analysis": ...}`` dicts, one per
        sampled frame in order. ``timestamp`` is the sample index multiplied by
        ``sample_interval``. The list is empty when the capture cannot be
        opened or yields no frames.

    Raises:
        ValueError: If ``sample_interval`` is not positive, or the opened
            video reports a non-positive frame rate.
    """
    if sample_interval <= 0:
        raise ValueError("sample_interval must be positive")

    analyses = []
    # One request per sampled frame.
    for sample_index, frame_base64 in _sample_video_frames(video_path, sample_interval):
        timestamp = sample_index * sample_interval
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"视频第 {timestamp} 秒的画面分析:"},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame_base64}"}},
                    ],
                }
            ],
        )
        analyses.append({
            "timestamp": timestamp,
            "analysis": response.choices[0].message.content,
        })
    return analyses


# 视频内容摘要
python
def summarize_video(video_path):
    """Produce a short summary of a video from descriptions of its key frames.

    Args:
        video_path: Video location, passed to ``extract_keyframes`` together
            with ``num_frames=10``.

    Returns:
        The message content of the first choice in the model response.
    """
    # 1. Extract key frames (helper defined elsewhere).
    keyframes = extract_keyframes(video_path, num_frames=10)

    # 2. Describe each key frame (helper defined elsewhere; assumed to return
    #    a string, since the results are joined with newlines — confirm).
    frame_descriptions = [analyze_frame(frame) for frame in keyframes]

    # 3. Build the summary prompt. The join is done outside the f-string:
    #    a backslash inside an f-string expression is a SyntaxError before
    #    Python 3.12.
    joined_descriptions = "\n".join(frame_descriptions)
    summary_prompt = (
        "\n基于以下视频关键帧描述,生成一个简洁的视频摘要:\n"
        f"{joined_descriptions}\n"
        "请包含:主题、关键事件、人物、场景。\n"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": summary_prompt}],
    )
    return response.choices[0].message.content


# 第三章:音频处理
音频转文本 + 分析
python
import whisper
def transcribe_and_analyze(audio_path, model_name="base"):
    """Transcribe an audio file with Whisper, then have GPT-4o analyse the text.

    Args:
        audio_path: Audio location passed to the Whisper model's
            ``transcribe`` method.
        model_name: Name given to ``whisper.load_model``. Defaults to
            ``"base"``.

    Returns:
        A dict with ``"transcript"`` (the ``"text"`` field of the Whisper
        result) and ``"analysis"`` (the message content of the first choice in
        the model response).
    """
    # 1. Speech recognition. The model is loaded on every call.
    model = whisper.load_model(model_name)
    transcript = model.transcribe(audio_path)["text"]

    # 2. Text analysis: summary, key points, action items, sentiment.
    prompt = (
        "\n请分析以下音频转文本:\n"
        f"{transcript}\n"
        "请提供:\n"
        "1. 摘要\n"
        "2. 关键点\n"
        "3. 行动项\n"
        "4. 情感分析\n"
    )
    analysis = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return {
        "transcript": transcript,
        "analysis": analysis.choices[0].message.content,
    }


# 第四章:多模态 RAG 系统
架构设计
用户查询
    │
    ├─── 文本检索 (Dense + Sparse)
    ├─── 图像检索 (CLIP Embedding)
    └─── 视频检索 (关键帧 Embedding)
              │
              └─── 多模态重排 (Reranker)
                        │
                        └─── 多模态上下文组装
                                  │
                                  └─── LLM 生成

实现示例
python
import chromadb
import numpy as np
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
class MultimodalRAG:
    """Small multimodal retrieval store backed by a single Chroma collection.

    Each document is stored as one vector: the mean of its text embedding
    and/or its image embedding.

    NOTE(review): averaging only works when both encoders emit vectors of the
    same length, and a collection holds vectors of one length. The default
    text and image models look like they differ (1024 vs 512 dimensions) —
    confirm, and pass a matching pair via ``text_model`` / ``image_model``
    if so.
    """

    def __init__(
        self,
        text_model='BAAI/bge-large-zh-v1.5',
        image_model='clip-ViT-B-32',
        collection_name="multimodal_docs",
    ):
        """Load both encoders and open the vector collection.

        Args:
            text_model: Name passed to ``SentenceTransformer`` for text.
            image_model: Name passed to ``SentenceTransformer`` for images.
            collection_name: Name of the Chroma collection to use.
        """
        # Text embedding
        self.text_encoder = SentenceTransformer(text_model)
        # Image embedding (CLIP)
        self.image_encoder = SentenceTransformer(image_model)
        # Vector database. get_or_create avoids an error when a collection
        # with this name already exists in the client.
        self.db = chromadb.Client()
        self.collection = self.db.get_or_create_collection(collection_name)

    @staticmethod
    def _combine(embeddings):
        """Average same-shaped embedding vectors into one vector."""
        vectors = [np.asarray(embedding) for embedding in embeddings]
        shapes = {vector.shape for vector in vectors}
        if len(shapes) > 1:
            raise ValueError(
                f"cannot average embeddings of different shapes: {sorted(shapes)}"
            )
        return np.mean(vectors, axis=0)

    def _encode_image(self, image_path):
        """Open an image file, embed it, and close the file again."""
        with Image.open(image_path) as image:
            return self.image_encoder.encode(image)

    def add_document(self, doc_id, text=None, image_path=None):
        """Add a document made of text, an image, or both.

        Does nothing when neither ``text`` nor ``image_path`` is given.

        Args:
            doc_id: Identifier stored as the record id and in the metadata.
            text: Optional text; embedded and kept in the metadata.
            image_path: Optional image path; the image is embedded and the
                path is kept in the metadata.

        Raises:
            ValueError: If the text and image embeddings differ in shape.
        """
        embeddings = []
        metadata = {"doc_id": doc_id}
        if text:
            embeddings.append(self.text_encoder.encode(text))
            metadata["text"] = text
        if image_path:
            embeddings.append(self._encode_image(image_path))
            metadata["image_path"] = image_path
        if not embeddings:
            return
        # Mean of the available modality embeddings.
        combined_emb = self._combine(embeddings)
        self.collection.add(
            embeddings=[combined_emb.tolist()],
            metadatas=[metadata],
            ids=[doc_id],
        )

    def query(self, query_text=None, query_image=None, top_k=5):
        """Search the collection with text, an image, or both.

        Args:
            query_text: Optional query string.
            query_image: Optional path of a query image.
            top_k: Number of results requested from the collection.

        Returns:
            Whatever ``collection.query`` returns for the combined embedding.

        Raises:
            ValueError: If neither input is given, or the text and image
                embeddings differ in shape.
        """
        if not query_text and not query_image:
            # Checked before any encoder is touched; averaging an empty list
            # would otherwise produce NaN.
            raise ValueError("query requires query_text and/or query_image")
        query_embeddings = []
        if query_text:
            query_embeddings.append(self.text_encoder.encode(query_text))
        if query_image:
            query_embeddings.append(self._encode_image(query_image))
        combined_query = self._combine(query_embeddings)
        return self.collection.query(
            query_embeddings=[combined_query.tolist()],
            n_results=top_k,
        )


# 多模态生成
python
def _flatten_retrieved_docs(retrieved_docs):
    """Return a flat list of per-document dicts from a list or a Chroma-style query result."""
    if isinstance(retrieved_docs, dict):
        # A Chroma query result keeps one list of metadata dicts per query
        # under "metadatas"; iterating the result itself only yields its keys.
        metadata_groups = retrieved_docs.get("metadatas") or []
        return [meta for group in metadata_groups for meta in (group or []) if meta]
    return list(retrieved_docs)


def multimodal_rag_generate(query, retrieved_docs):
    """Answer a question with GPT-4o using retrieved text and images.

    Args:
        query: The user's question.
        retrieved_docs: Either an iterable of dicts that may carry ``"text"``
            and/or ``"image_path"``, or a query-result dict whose
            ``"metadatas"`` entry holds lists of such dicts.

    Returns:
        The message content of the first choice in the model response.
    """
    docs = _flatten_retrieved_docs(retrieved_docs)

    # Build the context: text goes into the prompt, images are attached.
    context_parts = [doc["text"] for doc in docs if "text" in doc]
    images = [doc["image_path"] for doc in docs if "image_path" in doc]
    context = "\n\n".join(context_parts)

    # Build the message.
    content = [
        {"type": "text", "text": f"基于以下信息回答问题:{query}\n\n参考信息:\n{context}"}
    ]
    for img_path in images[:3]:  # at most 3 images
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img_path)}"},
        })
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
    )
    return response.choices[0].message.content


# 第五章:实际应用场景
场景 1:智能客服机器人
python
class SmartCustomerService:
    """Customer-service bot that answers from a multimodal knowledge base."""

    def __init__(self):
        """Create the retrieval backend."""
        self.rag = MultimodalRAG()

    def handle_message(self, user_message, attachments=None):
        """Answer a user message, using text extracted from image attachments.

        Args:
            user_message: The user's text.
            attachments: Optional iterable of objects with ``type`` and
                ``content`` attributes. Only those whose ``type`` is
                ``"image"`` are used; their ``content`` is passed to
                ``extract_text_from_image``.

        Returns:
            The value returned by ``multimodal_rag_generate``.
        """
        # Build the retrieval query from the message plus any OCR text.
        query_parts = [user_message]
        for att in attachments or []:
            if att.type == "image":
                # Image OCR; skip empty results so the join below cannot fail.
                text = extract_text_from_image(att.content)
                if text:
                    query_parts.append(text)

        # Retrieve
        results = self.rag.query(query_text=" ".join(query_parts))

        # A dict result keeps per-document metadata under "metadatas" as one
        # list per query; flatten it so the generator receives document dicts
        # rather than the dict's keys.
        if isinstance(results, dict):
            metadata_groups = results.get("metadatas") or []
            retrieved_docs = [
                meta for group in metadata_groups for meta in (group or []) if meta
            ]
        else:
            retrieved_docs = results

        # Generate the reply
        return multimodal_rag_generate(user_message, retrieved_docs)


# 场景 2:智能文档审核
python
def review_document(document_path):
    """Have GPT-4o review a document's text together with its images.

    Args:
        document_path: Document location, passed to ``extract_text``,
            ``extract_images`` and ``extract_tables`` (helpers defined
            elsewhere).

    Returns:
        The message content of the first choice in the model response.
    """
    # Extract the document content.
    text_content = extract_text(document_path)
    images = extract_images(document_path)
    # NOTE(review): the extracted tables are never sent to the model. The call
    # is kept in case the helper has side effects — confirm and either use the
    # result or drop the call.
    tables = extract_tables(document_path)

    # Review prompt; only the first 4000 characters of the text are included.
    review_prompt = (
        "\n请审核以下文档,检查:\n"
        "1. 事实准确性\n"
        "2. 数据一致性(文本 vs 图表)\n"
        "3. 格式规范\n"
        "4. 漏洞和风险\n"
        "文本内容:\n"
        f"{text_content[:4000]}\n"
    )
    content = [{"type": "text", "text": review_prompt}]

    # Attach at most five images for chart analysis.
    for img in images[:5]:
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img)}"},
        })
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
    )
    return response.choices[0].message.content


# 场景 3:教育辅导系统
python
class TutoringSystem:
    """Tutoring helper that explains photographed exercises with GPT-4o."""

    def __init__(self):
        """Set the list of subjects."""
        self.subjects = ["数学", "物理", "化学"]

    def explain_problem(self, problem_image, subject="数学"):
        """Explain, step by step, the exercise shown in an image.

        Args:
            problem_image: Image location, passed to ``encode_image``.
            subject: Subject name interpolated into the teacher persona of the
                prompt. It is not checked against ``self.subjects``.

        Returns:
            The message content of the first choice in the model response.
        """
        image_url = f"data:image/jpeg;base64,{encode_image(problem_image)}"
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"你是一个{subject}老师。请详细解释这道题的解题步骤。"},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
        )
        return response.choices[0].message.content


# 第六章:性能优化与成本控制
成本优化策略
| 策略 | 效果 | 实施难度 |
|---|---|---|
| 图片压缩 | 减少 50-70% token | 低 |
| 分辨率调整 | 低分辨率 = 更少 token | 低 |
| 批量处理 | 减少 API 调用次数 | 中 |
| 缓存策略 | 避免重复分析 | 中 |
| 模型选择 | 小模型处理简单任务 | 低 |
图片压缩工具
python
from PIL import Image
import io
def optimize_image(image_path, max_size=1024, quality=85):
    """Shrink and re-encode an image as JPEG to reduce API token usage.

    Args:
        image_path: Path of the image to open.
        max_size: Upper bound, in pixels, for both width and height; the
            aspect ratio is preserved.
        quality: JPEG quality setting passed to ``save``.

    Returns:
        The JPEG-encoded image as ``bytes``.
    """
    # Modes the JPEG writer accepts directly; anything else (for example
    # RGBA or palette images from PNG/GIF files) must be converted first or
    # ``save`` raises an error.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    with Image.open(image_path) as img:
        if img.mode not in jpeg_modes:
            # Converting to RGB drops any alpha channel.
            img = img.convert("RGB")
        # Proportional downscale, in place.
        img.thumbnail((max_size, max_size), Image.LANCZOS)
        # Compress into an in-memory buffer.
        buffer = io.BytesIO()
        img.save(buffer, format="JPEG", quality=quality, optimize=True)
    return buffer.getvalue()


# 相关页面
- Multimodal Models — 多模态模型概念
- 多模态模型对比 — 多模态模型对比
- RAG 系统搭建入门指南 — RAG 系统搭建指南
- AI Agent 开发入门指南 — AI Agent 开发指南
- Embedding Models / Vector Representations — Embedding 模型概念
- Vector Databases — 向量数据库概念
参考来源
- OpenAI Vision API 文档
- Gemini Multimodal API 文档
- CLIP 论文: "Learning Transferable Visual Models From Natural Language Supervision"
- LLaVA 论文: "Visual Instruction Tuning"
- BGE Embedding 文档