多语言与多模态内容审核
High Contrast
Dark Mode
Light Mode
Sepia
Forest
2 min read · 363 words

多语言与多模态内容审核

随着 LLM 应用全球化部署,内容审核必须支持多语言和多模态场景。

多语言审核架构

graph TB A[用户输入] --> B[语言检测] B --> C{语言类型} C -->|中文| D[中文审核管道] C -->|英文| E[英文审核管道] C -->|日韩| F[CJK审核管道] C -->|其他| G[通用审核管道] D --> H[结果聚合] E --> H F --> H G --> H H --> I[最终裁决] style A fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style B fill:#fff9c4,stroke:#f9a825,stroke-width:2px style I fill:#c8e6c9,stroke:#43a047,stroke-width:2px

多语言审核引擎

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
import re
class Language(Enum):
    """Languages the moderation pipeline can route on."""

    ZH = "zh"
    EN = "en"
    JA = "ja"
    KO = "ko"
    UNKNOWN = "unknown"
@dataclass
class ModerationResult:
    """Outcome of one moderation pass over a single piece of text."""

    language: Language                                    # detected dominant language
    is_safe: bool                                         # True when no keyword matched
    categories: list[str] = field(default_factory=list)   # matched keywords
    confidence: float = 0.0                               # heuristic confidence score
    details: str = ""                                     # diagnostic note
class LanguageDetector:
    """Fast language detection based purely on character-script statistics."""

    CJK_RANGES = [
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A
    ]
    HIRAGANA = (0x3040, 0x309F)
    KATAKANA = (0x30A0, 0x30FF)
    HANGUL = (0xAC00, 0xD7AF)

    def _bucket(self, char: str) -> Optional[str]:
        """Classify one character into a language bucket, or None if neutral."""
        cp = ord(char)
        for lo, hi in self.CJK_RANGES:
            if lo <= cp <= hi:
                return "zh"
        if self.HIRAGANA[0] <= cp <= self.HIRAGANA[1]:
            return "ja"
        if self.KATAKANA[0] <= cp <= self.KATAKANA[1]:
            return "ja"
        if self.HANGUL[0] <= cp <= self.HANGUL[1]:
            return "ko"
        if char.isascii() and char.isalpha():
            return "en"
        return None

    def detect(self, text: str) -> Language:
        """Return the dominant language of *text*.

        Kana or Hangul characters are decisive on their own; Han characters
        are shared with Japanese, so Chinese wins only when they outnumber
        Latin letters.
        """
        counts = {"zh": 0, "ja": 0, "ko": 0, "en": 0}
        for char in text:
            bucket = self._bucket(char)
            if bucket is not None:
                counts[bucket] += 1
        if counts["ja"]:
            return Language.JA
        if counts["ko"]:
            return Language.KO
        if counts["zh"] > counts["en"]:
            return Language.ZH
        if counts["en"]:
            return Language.EN
        return Language.UNKNOWN
class MultilingualModerator:
    """Keyword-bank moderator that routes text by its detected language."""

    # One independent keyword bank per supported language.
    KEYWORD_BANKS: dict[Language, list[str]] = {
        Language.ZH: ["暴力", "色情", "赌博", "毒品"],
        Language.EN: ["violence", "explicit", "gambling", "drugs"],
        Language.JA: ["暴力", "ギャンブル"],
        Language.KO: ["폭력", "도박"],
    }

    def __init__(self):
        self.detector = LanguageDetector()

    def moderate(self, text: str) -> ModerationResult:
        """Detect the language of *text*, then scan the matching keyword bank."""
        lang = self.detector.detect(text)
        bank = self.KEYWORD_BANKS.get(lang, [])
        haystack = text.lower()
        hits = [word for word in bank if word.lower() in haystack]
        return ModerationResult(
            language=lang,
            is_safe=not hits,
            categories=hits,
            # A known language with a bank gets higher confidence than the
            # fallback (empty-bank) path.
            confidence=0.85 if bank else 0.5,
            details=f"Detected language: {lang.value}",
        )
# Usage example
demo = MultilingualModerator()
verdict = demo.moderate("这是一段正常的中文内容")
print(f"语言: {verdict.language.value}, 安全: {verdict.is_safe}")

多模态审核策略

graph LR A[多模态输入] --> B{模态类型} B -->|文本| C[文本审核] B -->|图片| D[图片审核] B -->|代码| E[代码审核] B -->|混合| F[交叉审核] C --> G[风险评分] D --> G E --> G F --> G G --> H{综合评分} H -->|安全| I[放行] H -->|可疑| J[人工复核] H -->|危险| K[拦截] style A fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style I fill:#c8e6c9,stroke:#43a047,stroke-width:2px style J fill:#fff9c4,stroke:#f9a825,stroke-width:2px style K fill:#ffcdd2,stroke:#c62828,stroke-width:2px

模态特定审核器

from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class ModalityScore:
    """Risk assessment produced by a single per-modality moderator."""

    modality: str                  # which modality produced this score
    risk_score: float              # 0.0 (safe) ~ 1.0 (dangerous)
    categories: list[str]          # matched risk categories
    requires_human: bool = False   # escalate to manual review
class ModalityModerator(ABC):
    """Interface every per-modality moderator must implement."""

    @abstractmethod
    def check(self, content: str) -> ModalityScore:
        """Score *content* for this modality."""
class CodeModerator(ModalityModerator):
    """Static safety screening of code snippets via regex signatures.

    Each distinct matched category adds 0.3 to the risk score (capped at
    1.0); a score above 0.6 is flagged for human review.
    """

    # (raw pattern, category) pairs; category strings are returned verbatim
    # in ModalityScore.categories.
    DANGEROUS_PATTERNS = [
        (r"subprocess\.(call|run|Popen)", "命令执行"),
        (r"eval\(|exec\(", "动态执行"),
        (r"__import__\(", "动态导入"),
        (r"os\.system\(", "系统调用"),
        (r"shutil\.rmtree\(", "文件删除"),
        (r"urllib\.request|requests\.get", "网络请求"),
    ]
    # Compile every pattern once at class-load time instead of resolving the
    # raw strings through re's cache on each check() call.
    _COMPILED = [(re.compile(p), cat) for p, cat in DANGEROUS_PATTERNS]

    def check(self, content: str) -> ModalityScore:
        """Scan *content* for dangerous API usage.

        Returns a ModalityScore whose risk grows linearly (0.3 per
        category) with the number of matched categories.
        """
        found = [cat for pattern, cat in self._COMPILED if pattern.search(content)]
        risk = min(len(found) * 0.3, 1.0)
        return ModalityScore(
            modality="code",
            risk_score=risk,
            categories=found,
            requires_human=risk > 0.6,
        )
class CrossModalModerator:
    """Aggregates per-modality risk scores into one routing decision."""

    # Decision boundaries applied to the maximum per-modality risk score.
    THRESHOLDS = {
        "pass": 0.3,
        "review": 0.7,
    }

    def __init__(self, moderators: dict[str, ModalityModerator]):
        self.moderators = moderators

    def evaluate(self, contents: dict[str, str]) -> dict:
        """Run every registered moderator and decide pass/review/block.

        Modalities without a registered moderator are ignored; with no
        scores at all the maximum risk defaults to 0.0 (pass).
        """
        scores = [
            self.moderators[modality].check(payload)
            for modality, payload in contents.items()
            if modality in self.moderators
        ]
        max_risk = max((item.risk_score for item in scores), default=0.0)
        if max_risk < self.THRESHOLDS["pass"]:
            decision = "pass"
        elif max_risk < self.THRESHOLDS["review"]:
            decision = "review"
        else:
            decision = "block"
        return {
            "decision": decision,
            "max_risk": max_risk,
            "scores": scores,
        }

多语言审核对比

维度 中文审核 英文审核 多语言混合
分词难度 高(无空格) 低(空格分隔) 极高(混合分词)
关键词库 需含变体/谐音 标准化较好 需每种语言独立维护
上下文理解 成语/典故干扰 俚语/缩写干扰 跨语言语义漂移
误判风险 中(同音字) 高(翻译歧义) 极高(跨语言歧义叠加)
审核模型 中文专用模型 通用模型 多模型集成
合规要求 网信办规范 FTC/GDPR 需满足所有地区法规

本章小结

下一章:对抗攻击类型