目标检测与图像分类
图像理解有两个基础任务:分类("这是什么?")和检测("在哪里?是什么?")。掌握这两项能力,是构建视觉 AI 应用的基石。
任务类型对比
graph LR
A[图像理解任务] --> B[图像分类
Classification] A --> C[目标检测
Object Detection] A --> D[实例分割
Instance Segmentation] A --> E[关键点检测
Keypoint Detection] B --> B1[输出: 类别标签
ResNet / EfficientNet] C --> C1[输出: 边界框 + 类别
YOLO / DETR] D --> D1[输出: 像素级掩码
Mask R-CNN / SAM] E --> E1[输出: 骨架关键点
MediaPipe / OpenPose] style C fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style D fill:#ede7f6,stroke:#5e35b1,stroke-width:2px
YOLO 目标检测实战
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import json
@dataclass
class BoundingBox:
    """Axis-aligned bounding box in normalized center/size format (all values 0–1)."""

    x_center: float
    y_center: float
    width: float
    height: float

    def _corners(self) -> tuple[float, float, float, float]:
        """Return normalized corner coordinates (x1, y1, x2, y2)."""
        half_w = self.width / 2
        half_h = self.height / 2
        return (
            self.x_center - half_w,
            self.y_center - half_h,
            self.x_center + half_w,
            self.y_center + half_h,
        )

    def to_pixel(self, img_w: int, img_h: int) -> tuple[int, int, int, int]:
        """Convert to integer pixel corner coordinates (x1, y1, x2, y2)."""
        x1, y1, x2, y2 = self._corners()
        return (
            int(x1 * img_w),
            int(y1 * img_h),
            int(x2 * img_w),
            int(y2 * img_h),
        )

    def iou(self, other: "BoundingBox") -> float:
        """Compute Intersection over Union with another box (0.0 when disjoint)."""
        ax1, ay1, ax2, ay2 = self._corners()
        bx1, by1, bx2, by2 = other._corners()

        # Overlap rectangle: max of the mins, min of the maxes.
        ix1, iy1 = max(ax1, bx1), max(ay1, by1)
        ix2, iy2 = min(ax2, bx2), min(ay2, by2)
        if ix2 < ix1 or iy2 < iy1:
            return 0.0

        intersection = (ix2 - ix1) * (iy2 - iy1)
        union = self.width * self.height + other.width * other.height - intersection
        # Guard against division by zero for degenerate (zero-area) boxes.
        return intersection / max(union, 1e-8)
@dataclass
class Detection:
    """One detected object: class identity, confidence score, and location."""

    class_id: int
    class_name: str
    confidence: float
    bbox: BoundingBox

    @property
    def is_confident(self) -> bool:
        """Whether the score clears the conventional 0.5 cutoff."""
        return self.confidence >= 0.5
@dataclass
class DetectionResult:
    """All detections for one image, plus inference timing metadata."""

    image_path: str
    detections: list[Detection] = field(default_factory=list)
    inference_time_ms: int = 0

    def filter_by_class(self, class_name: str) -> list[Detection]:
        """Keep only detections whose label equals *class_name* exactly."""
        return [det for det in self.detections if det.class_name == class_name]

    def filter_by_confidence(self, min_conf: float = 0.5) -> list[Detection]:
        """Keep only detections scoring at least *min_conf*."""
        return [det for det in self.detections if det.confidence >= min_conf]

    def to_dict(self) -> dict:
        """Serialize to a plain JSON-friendly dict (confidence rounded to 3 dp)."""

        def encode(det: Detection) -> dict:
            box = det.bbox
            return {
                "class": det.class_name,
                "confidence": round(det.confidence, 3),
                "bbox": {
                    "x": box.x_center,
                    "y": box.y_center,
                    "w": box.width,
                    "h": box.height,
                },
            }

        return {
            "image": self.image_path,
            "count": len(self.detections),
            "detections": [encode(det) for det in self.detections],
        }
class YOLODetector:
    """
    Thin wrapper around Ultralytics YOLOv8 object detection.

    Dependency: pip install ultralytics (imported lazily on first use so
    constructing the wrapper stays cheap and import-safe).
    """

    def __init__(self, model_path: str = "yolov8n.pt", conf_threshold: float = 0.5):
        # Weights are not loaded here; see _load().
        self.model_path = model_path
        self.conf_threshold = conf_threshold
        self._model = None

    def _load(self):
        """Load the YOLO model on first use (idempotent)."""
        if self._model is None:
            from ultralytics import YOLO
            self._model = YOLO(self.model_path)

    def detect(self, image_path: str) -> DetectionResult:
        """Run detection on a single image.

        Returns a DetectionResult with one Detection per box above the
        confidence threshold, plus the measured inference time in ms.
        """
        import time
        self._load()
        # perf_counter is monotonic and high-resolution — unlike time.time(),
        # it cannot go backwards if the wall clock is adjusted mid-inference.
        start = time.perf_counter()
        results = self._model(image_path, conf=self.conf_threshold, verbose=False)
        elapsed_ms = int((time.perf_counter() - start) * 1000)

        detections = []
        for result in results:
            for box in result.boxes:
                cls_id = int(box.cls[0])
                detections.append(Detection(
                    class_id=cls_id,
                    class_name=result.names[cls_id],
                    confidence=float(box.conf[0]),
                    # xywhn: box center/size normalized to 0–1.
                    bbox=BoundingBox(
                        x_center=float(box.xywhn[0][0]),
                        y_center=float(box.xywhn[0][1]),
                        width=float(box.xywhn[0][2]),
                        height=float(box.xywhn[0][3]),
                    ),
                ))
        return DetectionResult(
            image_path=image_path,
            detections=detections,
            inference_time_ms=elapsed_ms,
        )

    def batch_detect(self, image_paths: list[str]) -> list[DetectionResult]:
        """Detect over many images, one at a time.

        NOTE(review): this loops sequentially; ultralytics also accepts a
        list of paths for true batched inference — TODO confirm and switch
        if throughput matters.
        """
        return [self.detect(p) for p in image_paths]
# --- Usage example ---
detector = YOLODetector(model_path="yolov8m.pt", conf_threshold=0.6)
# result = detector.detect("product_shelf.jpg")
# print(f"Detected {len(result.detections)} objects")
# for det in result.filter_by_class("bottle"):
#     print(f"  bottle: {det.confidence:.2f} @ {det.bbox}")

# Demonstrate IoU on two partially overlapping boxes.
first_box = BoundingBox(0.5, 0.5, 0.4, 0.4)
second_box = BoundingBox(0.6, 0.6, 0.4, 0.4)
print(f"IoU (A, B) = {first_box.iou(second_box):.3f}")
模型选型对比
| 模型 | 速度 | 精度 | 大小 | 最适场景 |
|---|---|---|---|---|
| YOLOv8n | 极快 | 低 | 6MB | 实时边缘推理 |
| YOLOv8m | 快 | 中 | 50MB | 均衡生产部署 |
| YOLOv8x | 慢 | 高 | 131MB | 高精度离线处理 |
| DETR | 中 | 高 | 166MB | 复杂场景理解 |
| SAM | 慢 | 极高 | 358MB | 精细分割 |
本章小结
- YOLOv8 是生产环境的首选——速度/精度均衡,社区活跃
- IoU 是检测质量的核心指标——NMS 去重和 mAP 计算都依赖 IoU
- 置信度阈值要针对场景调整——安全场景用 0.7+,探索场景可降至 0.3
- 批量推理比逐张快 3–5x——前提是把整批图像一次性交给模型(本文示例的 `batch_detect` 为逐张循环,仅作接口演示);批次大小建议 8–32
- SAM 用于精细分割,不用于实时检测——模型大小决定使用场景
下一章:语音AI技术