多轮对话质量检测
单轮问答的评估相对简单——质量问题在多轮对话中才会全面暴露:上下文遗忘、角色漂移、立场矛盾、话题跑偏……
多轮对话质量维度
graph TB
    A[多轮对话质量] --> B[上下文一致性<br/>Context Consistency]
    A --> C[角色稳定性<br/>Persona Stability]
    A --> D[指令遵从<br/>Instruction Following]
    A --> E[话题连贯性<br/>Topic Coherence]
    A --> F[内存利用率<br/>Memory Utilization]
    B --> B1[不矛盾之前的陈述]
    C --> C1[人设不漂移]
    D --> D1[遵从用户约束条件]
    E --> E1[不随意转移话题]
    F --> F1[有效利用对话历史]
    style A fill:#ede7f6,stroke:#5e35b1,stroke-width:2px
    style B fill:#ffcdd2,stroke:#c62828,stroke-width:2px
    style F fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
多轮对话评估器实现
from dataclasses import dataclass, field
from typing import Optional
import re
@dataclass
class TurnMessage:
    """A single message within a multi-turn conversation."""
    role: str  # "user" or "assistant"
    content: str  # raw message text
    turn_id: int  # 0-based position of this message within the session
@dataclass
class ConversationSession:
    """An ordered record of one complete multi-turn conversation."""
    session_id: str
    turns: list[TurnMessage] = field(default_factory=list)

    def add_turn(self, role: str, content: str) -> None:
        """Append a message; its turn_id is its index in the sequence."""
        self.turns.append(TurnMessage(role, content, len(self.turns)))

    def assistant_turns(self) -> list[TurnMessage]:
        """Return every assistant message, preserving conversation order."""
        return [msg for msg in self.turns if msg.role == "assistant"]

    def user_turns(self) -> list[TurnMessage]:
        """Return every user message, preserving conversation order."""
        return [msg for msg in self.turns if msg.role == "user"]
class ConversationEvaluator:
"""多轮对话质量评估器"""
def evaluate_context_consistency(self, session: ConversationSession) -> float:
"""
检测上下文一致性:扫描助手回复中是否出现互相矛盾的陈述
返回 0.0(严重矛盾)–1.0(完全一致)
真实实现中使用 NLI 模型检测矛盾
"""
assistant_turns = session.assistant_turns()
if len(assistant_turns) < 2:
return 1.0
contradiction_count = 0
total_pairs = 0
for i in range(len(assistant_turns)):
for j in range(i + 1, len(assistant_turns)):
total_pairs += 1
# 示意:检测明显矛盾词对
text_i = assistant_turns[i].content.lower()
text_j = assistant_turns[j].content.lower()
contradiction_signals = [
("是" in text_i and "否" in text_j and
any(kw in text_i and kw in text_j
for kw in ["可以", "支持", "允许"])),
]
if any(contradiction_signals):
contradiction_count += 1
if total_pairs == 0:
return 1.0
return max(0.0, 1.0 - contradiction_count / total_pairs)
def evaluate_memory_utilization(self, session: ConversationSession) -> float:
"""
检测内存利用率:模型是否在后续回复中引用了早期对话信息
"""
assistant_turns = session.assistant_turns()
user_turns = session.user_turns()
if len(assistant_turns) < 2 or len(user_turns) < 2:
return 1.0
# 收集用户提供的关键信息(名字、数字、关键词)
key_facts: list[str] = []
for turn in user_turns[:-1]: # 排除最后一轮
# 提取数字和专有名词(简化示意)
numbers = re.findall(r'\b\d+\b', turn.content)
key_facts.extend(numbers)
if not key_facts:
return 1.0
# 检查最后几轮助手回复是否引用了这些信息
late_turns_text = " ".join(t.content for t in assistant_turns[-3:])
referenced = sum(1 for fact in key_facts if fact in late_turns_text)
return referenced / len(key_facts)
def evaluate_instruction_adherence(
self,
session: ConversationSession,
system_constraints: list[str],
) -> float:
"""
检测模型是否一直遵守系统级约束
system_constraints: 例如 ["回答不超过200字", "只用中文回答"]
"""
if not system_constraints:
return 1.0
violations = 0
checked = 0
for turn in session.assistant_turns():
for constraint in system_constraints:
checked += 1
# 字数限制检测
if "不超过" in constraint:
limit_match = re.search(r'不超过(\d+)字', constraint)
if limit_match:
limit = int(limit_match.group(1))
if len(turn.content) > limit:
violations += 1
# 语言约束检测(简化示意)
if "只用中文" in constraint:
english_ratio = len(re.findall(r'[a-zA-Z]', turn.content)) / max(len(turn.content), 1)
if english_ratio > 0.2:
violations += 1
if checked == 0:
return 1.0
return max(0.0, 1.0 - violations / checked)
def full_evaluation(
self,
session: ConversationSession,
system_constraints: list[str] | None = None,
) -> dict:
"""综合评估多轮对话质量"""
consistency = self.evaluate_context_consistency(session)
memory = self.evaluate_memory_utilization(session)
adherence = self.evaluate_instruction_adherence(
session, system_constraints or []
)
weighted_score = 0.4 * consistency + 0.3 * memory + 0.3 * adherence
return {
"overall_score": round(weighted_score, 3),
"context_consistency": round(consistency, 3),
"memory_utilization": round(memory, 3),
"instruction_adherence": round(adherence, 3),
"turn_count": len(session.turns),
"quality_label": "优秀" if weighted_score >= 0.8 else "良好" if weighted_score >= 0.6 else "需改进",
}
# Usage example: score a short conversation in which the assistant
# forgets the user's second question.
evaluator = ConversationEvaluator()
session = ConversationSession("test_session_001")
dialogue = [
    ("user", "我叫张伟,我有2个问题想问你。"),
    ("assistant", "您好张伟!请问您的2个问题是什么?"),
    ("user", "第一个问题:Python 和 Java 哪个更好?"),
    ("assistant", "这取决于使用场景。Python 适合数据科学,Java 适合企业级后端。"),
    ("user", "那第二个问题呢?"),
    ("assistant", "(忘记问了第二个问题) 您还有其他问题吗?"),
]
for role, content in dialogue:
    session.add_turn(role, content)
result = evaluator.full_evaluation(
    session,
    system_constraints=["只用中文回答"],
)
print(f"评估结果: {result}")
常见多轮对话缺陷分类
| 缺陷类型 | 表现 | 检测方法 | 严重程度 |
|---|---|---|---|
| 上下文遗忘 | 忘记早期提到的信息 | 关键词引用率检测 | 高 |
| 自我矛盾 | 前后陈述互相冲突 | NLI 矛盾检测 | 高 |
| 人设漂移 | 角色特征逐渐消失 | 人设关键词追踪 | 中 |
| 指令违反 | 不遵守用户设定的约束 | 约束规则检测 | 高 |
| 响应重复 | 反复输出相同内容 | 余弦相似度检测 | 中 |
| 话题跑偏 | 无关内容大量出现 | 主题一致性评分 | 低 |
本章小结
- 多轮评估≠多次单轮评估——上下文相关性是核心考察点
- 关键信息引用率——是内存利用率的简单但有效指标
- NLI 模型检测矛盾——纯规则方法难以发现语义矛盾
- 进行长对话压测——至少测试 20 轮以上才能暴露遗忘问题
- 系统提示词约束追踪——每轮都需验证约束是否被遵循
下一章:AB测试与持续监控