目标检测与图像分类
图像理解有两个基础任务:分类("这是什么?")和检测("在哪里?是什么?")。掌握这两项能力,是构建视觉 AI 应用的基石。
任务类型对比
graph LR
A[图像理解任务] --> B[图像分类
Classification] A --> C[目标检测
Object Detection] A --> D[实例分割
Instance Segmentation] A --> E[关键点检测
Keypoint Detection] B --> B1[输出: 类别标签
ResNet / EfficientNet] C --> C1[输出: 边界框 + 类别
YOLO / DETR] D --> D1[输出: 像素级掩码
Mask R-CNN / SAM] E --> E1[输出: 骨架关键点
MediaPipe / OpenPose] style C fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style D fill:#ede7f6,stroke:#5e35b1,stroke-width:2px
YOLO 目标检测实战
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import json
@dataclass
class BoundingBox:
    """Axis-aligned bounding box in normalized center/size format (all values 0–1)."""

    x_center: float
    y_center: float
    width: float
    height: float

    def _corners(self) -> tuple[float, float, float, float]:
        """Return normalized corner coordinates (x1, y1, x2, y2)."""
        half_w = self.width / 2
        half_h = self.height / 2
        return (
            self.x_center - half_w,
            self.y_center - half_h,
            self.x_center + half_w,
            self.y_center + half_h,
        )

    def to_pixel(self, img_w: int, img_h: int) -> tuple[int, int, int, int]:
        """Convert to integer pixel corner coordinates (x1, y1, x2, y2)."""
        x1, y1, x2, y2 = self._corners()
        return (
            int(x1 * img_w),
            int(y1 * img_h),
            int(x2 * img_w),
            int(y2 * img_h),
        )

    def iou(self, other: "BoundingBox") -> float:
        """Compute Intersection over Union with another box (0.0 when disjoint)."""
        ax1, ay1, ax2, ay2 = self._corners()
        bx1, by1, bx2, by2 = other._corners()

        # Overlap rectangle: max of the mins, min of the maxes.
        ix1, iy1 = max(ax1, bx1), max(ay1, by1)
        ix2, iy2 = min(ax2, bx2), min(ay2, by2)
        if ix2 < ix1 or iy2 < iy1:
            return 0.0

        intersection = (ix2 - ix1) * (iy2 - iy1)
        union = self.width * self.height + other.width * other.height - intersection
        # Guard against division by zero for degenerate (zero-area) boxes.
        return intersection / max(union, 1e-8)
@dataclass
class Detection:
    """One detected object: class identity, confidence score, and location."""

    class_id: int
    class_name: str
    confidence: float
    bbox: BoundingBox

    @property
    def is_confident(self) -> bool:
        """Whether the score clears the conventional 0.5 cutoff."""
        return self.confidence >= 0.5
@dataclass
class DetectionResult:
    """All detections for one image, plus inference timing metadata."""

    image_path: str
    detections: list[Detection] = field(default_factory=list)
    inference_time_ms: int = 0

    def filter_by_class(self, class_name: str) -> list[Detection]:
        """Keep only detections whose label equals *class_name* exactly."""
        return [det for det in self.detections if det.class_name == class_name]

    def filter_by_confidence(self, min_conf: float = 0.5) -> list[Detection]:
        """Keep only detections scoring at least *min_conf*."""
        return [det for det in self.detections if det.confidence >= min_conf]

    def to_dict(self) -> dict:
        """Serialize to a plain JSON-friendly dict (confidence rounded to 3 dp)."""

        def encode(det: Detection) -> dict:
            box = det.bbox
            return {
                "class": det.class_name,
                "confidence": round(det.confidence, 3),
                "bbox": {
                    "x": box.x_center,
                    "y": box.y_center,
                    "w": box.width,
                    "h": box.height,
                },
            }

        return {
            "image": self.image_path,
            "count": len(self.detections),
            "detections": [encode(det) for det in self.detections],
        }
class YOLODetector:
    """
    Thin wrapper around Ultralytics YOLOv8 object detection.

    Dependency: pip install ultralytics (imported lazily on first use so
    constructing the wrapper stays cheap and import-safe).
    """

    def __init__(self, model_path: str = "yolov8n.pt", conf_threshold: float = 0.5):
        # Weights are not loaded here; see _load().
        self.model_path = model_path
        self.conf_threshold = conf_threshold
        self._model = None

    def _load(self):
        """Load the YOLO model on first use (idempotent)."""
        if self._model is None:
            from ultralytics import YOLO
            self._model = YOLO(self.model_path)

    def detect(self, image_path: str) -> DetectionResult:
        """Run detection on a single image.

        Returns a DetectionResult with one Detection per box above the
        confidence threshold, plus the measured inference time in ms.
        """
        import time
        self._load()
        # perf_counter is monotonic and high-resolution — unlike time.time(),
        # it cannot go backwards if the wall clock is adjusted mid-inference.
        start = time.perf_counter()
        results = self._model(image_path, conf=self.conf_threshold, verbose=False)
        elapsed_ms = int((time.perf_counter() - start) * 1000)

        detections = []
        for result in results:
            for box in result.boxes:
                cls_id = int(box.cls[0])
                detections.append(Detection(
                    class_id=cls_id,
                    class_name=result.names[cls_id],
                    confidence=float(box.conf[0]),
                    # xywhn: box center/size normalized to 0–1.
                    bbox=BoundingBox(
                        x_center=float(box.xywhn[0][0]),
                        y_center=float(box.xywhn[0][1]),
                        width=float(box.xywhn[0][2]),
                        height=float(box.xywhn[0][3]),
                    ),
                ))
        return DetectionResult(
            image_path=image_path,
            detections=detections,
            inference_time_ms=elapsed_ms,
        )

    def batch_detect(self, image_paths: list[str]) -> list[DetectionResult]:
        """Detect over many images, one at a time.

        NOTE(review): this loops sequentially; ultralytics also accepts a
        list of paths for true batched inference — TODO confirm and switch
        if throughput matters.
        """
        return [self.detect(p) for p in image_paths]
# --- Usage example ---
detector = YOLODetector(model_path="yolov8m.pt", conf_threshold=0.6)
# result = detector.detect("product_shelf.jpg")
# print(f"Detected {len(result.detections)} objects")
# for det in result.filter_by_class("bottle"):
#     print(f"  bottle: {det.confidence:.2f} @ {det.bbox}")

# Demonstrate IoU on two partially overlapping boxes.
first_box = BoundingBox(0.5, 0.5, 0.4, 0.4)
second_box = BoundingBox(0.6, 0.6, 0.4, 0.4)
print(f"IoU (A, B) = {first_box.iou(second_box):.3f}")
模型选型对比
| 模型 | 速度 | 精度 | 大小 | 最适场景 |
|---|---|---|---|---|
| YOLOv8n | 极快 | 低 | 6MB | 实时边缘推理 |
| YOLOv8m | 快 | 中 | 50MB | 均衡生产部署 |
| YOLOv8x | 慢 | 高 | 131MB | 高精度离线处理 |
| DETR | 中 | 高 | 166MB | 复杂场景理解 |
| SAM | 慢 | 极高 | 358MB | 精细分割 |
本章小结
- YOLOv8 是生产环境的首选——速度/精度均衡,社区活跃
- IoU 是检测质量的核心指标——NMS 去重和 mAP 计算都依赖 IoU
- 置信度阈值要针对场景调整——安全场景用 0.7+,探索场景可降至 0.3
- 批量推理比逐张快 3–5x——前提是把整批图像一次性交给模型(本文示例的 `batch_detect` 为逐张循环,仅作接口演示);批次大小建议 8–32
- SAM 用于精细分割,不用于实时检测——模型大小决定使用场景
下一章:语音AI技术