2 min read · 469 words

注意力机制

自注意力机制是Transformer的核心，也是理解LLM工作原理的关键。

什么是注意力？

注意力机制让模型能够关注输入序列中最重要的部分。

直观理解

想象阅读句子：

"The bank of the river is beautiful."

当理解 "bank" 时，我们需要关注 "river" 来确定它的意思是"河岸"而不是"银行"。

graph LR
    A[bank] -->|注意力权重| B[river]
    A --> C[the]
    A --> D[of]
    A --> E[is]
    B -.->|高权重| F[确定含义]
    C -.->|低权重| F
    D -.->|低权重| F
    E -.->|低权重| F
    style B fill:#ff6b6b
    style F fill:#51cf66

自注意力计算

数学公式

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

其中：
- $Q$ (Query)：查询向量
- $K$ (Key)：键向量
- $V$ (Value)：值向量
- $d_k$：键向量的维度

计算步骤

import torch
import torch.nn as nn
import torch.nn.functional as F
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Implements ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        query: Tensor of shape [batch_size, num_heads, seq_len, d_k].
        key: Tensor of shape [batch_size, num_heads, seq_len, d_k].
        value: Tensor of shape [batch_size, num_heads, seq_len, d_k].
        mask: Optional tensor that broadcasts against the score tensor
            [batch, heads, seq, seq]. Positions where ``mask == 0`` are
            excluded from attention.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax over the last dimension of the scaled scores and
        ``output`` is ``attention_weights @ value``.
    """
    # 1. Raw attention scores.
    scores = torch.matmul(query, key.transpose(-2, -1))  # [batch, heads, seq, seq]

    # 2. Scale by sqrt(d_k) so the dot products do not grow with the key
    #    dimension (large scores saturate softmax and shrink its gradients).
    #    A plain Python scalar avoids allocating a tensor on every call.
    d_k = query.size(-1)
    scores = scores / (d_k ** 0.5)

    # 3. Apply the optional mask.
    if mask is not None:
        # -1e9 is not representable in float16 and makes masked_fill raise,
        # so cap the fill value at the most negative finite value of the
        # score dtype. For float32/float64 this is still exactly -1e9.
        fill_value = max(-1e9, torch.finfo(scores.dtype).min)
        scores = scores.masked_fill(mask == 0, fill_value)

    # 4. Normalize over the key positions.
    attention_weights = F.softmax(scores, dim=-1)

    # 5. Weighted sum of the values.
    output = torch.matmul(attention_weights, value)
    return output, attention_weights

Q、K、V的来源

在自注意力中，Q、K、V都来自同一个输入，通过三个不同的线性投影得到。

graph TB
    A[输入嵌入] --> B[Q = W_q × X]
    A --> C[K = W_k × X]
    A --> D[V = W_v × X]
    B --> E[注意力计算]
    C --> E
    D --> E
    E --> F[输出]
    style A fill:#e1f5ff
    style F fill:#c8e6c9

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention.

    Projects the input into queries, keys and values, splits each into
    ``num_heads`` heads of size ``d_model // num_heads``, runs
    ``scaled_dot_product_attention`` on them and merges the heads back
    through a final linear projection.
    """

    def __init__(self, d_model, num_heads):
        """Create the projection layers.

        Args:
            d_model: Size of the model (embedding) dimension.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``d_model`` evenly.
        """
        super().__init__()
        # Fail early: otherwise the mismatch only surfaces as an opaque
        # shape error from ``view`` inside ``forward``.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head dimension

        # Projections for Q, K and V.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        # Output projection applied after the heads are merged.
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape [batch, seq_len, d_model].
            mask: Optional mask forwarded unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            A tuple ``(output, attn_weights)``; ``output`` has shape
            [batch, seq_len, d_model] and ``attn_weights`` is whatever
            ``scaled_dot_product_attention`` returns for the per-head inputs.
        """
        batch_size, seq_len, d_model = x.shape

        # 1. Compute Q, K, V: each [batch, seq_len, d_model].
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # 2. Split into heads: [batch, num_heads, seq_len, d_k].
        Q = Q.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # 3. Attention per head.
        attn_output, attn_weights = scaled_dot_product_attention(Q, K, V, mask)

        # 4. Merge heads; ``contiguous`` is needed before ``view`` because
        #    ``transpose`` returns a non-contiguous tensor.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, d_model)

        # 5. Output projection.
        output = self.W_o(attn_output)
        return output, attn_weights

多头注意力（Multi-Head Attention）

为什么需要多头？

- 每个头可以关注不同的信息
- 类似于用多个"视角"理解句子
- 增强模型的表达能力

# Example: four heads, each free to focus on a different aspect of the input.
#   head 1 - syntactic structure (subject / verb / object)
#   head 2 - semantic relations (e.g. cause and effect)
#   head 3 - coreference (which noun a pronoun refers to)
#   head 4 - sentiment
heads_attention = ["语法结构", "语义关系", "指代关系", "情感色彩"]

graph TB
    A[输入] --> B[头1: 关注语法]
    A --> C[头2: 关注语义]
    A --> D[头3: 关注指代]
    A --> E[头4: 关注情感]
    B --> F[拼接]
    C --> F
    D --> F
    E --> F
    F --> G[线性投影]
    G --> H[最终输出]
    style A fill:#e1f5ff
    style H fill:#c8e6c9

注意力可视化

import matplotlib.pyplot as plt
import seaborn as sns
def visualize_attention(attention_weights, tokens):
    """Plot an attention-weight matrix as a heatmap.

    Rows are labelled as query positions and columns as key positions.

    Args:
        attention_weights: Tensor of shape [seq_len, seq_len] holding the
            attention matrix.
        tokens: List of token labels used for both axes.
    """
    # Detach from autograd and move to CPU first: calling .numpy() directly
    # raises for tensors that require grad or live on another device, which
    # is the usual state of weights returned by a model's forward pass.
    weights = attention_weights.detach().cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        weights,
        xticklabels=tokens,
        yticklabels=tokens,
        cmap="YlOrRd",
        cbar=True,
    )
    plt.xlabel('Key位置')
    plt.ylabel('Query位置')
    plt.title('注意力权重矩阵')
    plt.show()
# Example: random values stand in for real attention weights, so the plot
# only demonstrates the layout, not a meaningful attention pattern.
tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat']
attn_weights = torch.rand(len(tokens), len(tokens))
visualize_attention(attn_weights, tokens)

实际案例分析

案例1：代词指代

句子："The cat sat on the mat because it was comfortable."

# Hypothetical attention distribution for the token "it".
# 'mat' gets the largest weight (most likely referent), 'cat' is the
# runner-up, and the remaining tokens receive little attention.
it_attention = {'cat': 0.35, 'mat': 0.55, 'sat': 0.05, 'was': 0.05}

案例2：长距离依赖

句子："Mary had a little lamb whose fleece was white as snow."

# Attention distribution for the token "snow".
#   'fleece' - direct association
#   'white'  - attribute association
#   'lamb'   - long-distance association
#   'Mary'   - weak, far-away association
snow_attention = {'fleece': 0.40, 'white': 0.35, 'lamb': 0.15, 'Mary': 0.10}

注意力的不同变体

1. 因果注意力（Causal Attention）

GPT等生成模型使用，防止看到未来信息。

def create_causal_mask(seq_len):
    """Build a causal mask as a lower-triangular matrix.

    Returns a [seq_len, seq_len] tensor whose entry (i, j) is 1 when
    j <= i and 0 otherwise.
    """
    all_ones = torch.ones(seq_len, seq_len)
    return all_ones.tril()
# Example: build and print the causal mask for a 5-token sequence.
seq_len = 5
causal_mask = create_causal_mask(seq_len)
print(causal_mask)
# Printed result (ones on and below the diagonal):
# tensor([[1., 0., 0., 0., 0.],
#         [1., 1., 0., 0., 0.],
#         [1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1.]])

2. 填充掩码（Padding Mask）

处理不等长序列时使用。

def create_padding_mask(seq_len, actual_lengths):
    """Build a padding mask for a batch of variable-length sequences.

    Args:
        seq_len: Maximum (padded) sequence length.
        actual_lengths: Real length of each sequence in the batch, as a
            sequence of ints or a 1-D tensor.

    Returns:
        A tensor of shape [batch_size, seq_len] in the default floating
        dtype: 1 at real token positions and 0 at padded positions. A length
        greater than or equal to ``seq_len`` yields an all-ones row.

    NOTE: the result is [batch_size, seq_len]. To mask attention scores of
    shape [batch, heads, seq, seq] it has to be reshaped first, for example
    ``mask[:, None, None, :]``.
    """
    lengths = torch.as_tensor(actual_lengths)
    positions = torch.arange(seq_len, device=lengths.device)
    # Compare every position index [1, seq_len] with every length [batch, 1]
    # in one broadcasted operation instead of looping over rows in Python.
    is_real_token = positions.unsqueeze(0) < lengths.unsqueeze(1)
    return is_real_token.to(torch.get_default_dtype())

实践：完整的多头注意力

import torch
import torch.nn as nn
import torch.nn.functional as F
class AttentionBlock(nn.Module):
    """Attention block: self-attention and a feed-forward step, each wrapped
    in dropout, a residual connection and layer normalization.

    NOTE(review): the feed-forward step here is a single linear layer with
    no activation function.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        """Create the sub-layers.

        Args:
            d_model: Size of the model dimension.
            num_heads: Number of attention heads.
            dropout: Dropout probability applied to both sub-layer outputs.
        """
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x`` and return the transformed tensor."""
        # Sub-layer 1: attention -> dropout -> residual add -> layer norm.
        attended = self.attention(x, mask)[0]
        after_attention = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward -> dropout -> residual add -> layer norm.
        projected = self.ffn(after_attention)
        return self.norm2(after_attention + self.dropout(projected))
# Smoke test: push a random batch through the block and print the input
# and output shapes.
batch_size, seq_len = 4, 16
d_model, num_heads = 512, 8

x = torch.randn(batch_size, seq_len, d_model)
block = AttentionBlock(d_model, num_heads)
output = block(x)

print(f"输入形状: {x.shape}")
print(f"输出形状: {output.shape}")

注意力的重要性

mindmap
  root((注意力机制))
    优势
      捕捉长距离依赖
      并行计算
      可解释性强
      全局信息整合
    应用
      机器翻译
      文本摘要
      问答系统
      代码生成
    创新
      自注意力
      多头注意力
      交叉注意力
      稀疏注意力

学习要点

- ✅ 注意力机制让模型关注重要信息
- ✅ Q、K、V 由同一输入经三个不同的线性变换得到
- ✅ 缩放因子 $\sqrt{d_k}$ 防止点积过大导致 softmax 饱和，从而避免梯度消失
- ✅ 多头注意力提供多个"视角"
- ✅ 因果注意力用于生成任务
- ✅ 注意力权重可以通过可视化来解释

下一步: 了解 LLM完整工作流程和训练过程 🔄