For the underlying theory of the Transformer, I recommend reading the original paper or other people's blog posts.
Recommended:
- Blog 1
- A very detailed walkthrough of the mask mechanism in the Transformer
- A layer-by-layer breakdown of how Self-Attention, MultiHead-Attention and Masked-Attention work
- An intuitive explanation of Q, K, V in the Transformer (the "playboy and backup" analogy)
- A beginner-friendly, step-by-step Transformer walkthrough
Model code
The reference code has been lightly modified and repackaged.
The Transformer model
import math
import torch
import torch.nn as nn
import numpy as np
import argparse
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--d_k", type=int, default=64, help="dimension of the attention Key (and Query) vectors")
parser.add_argument("--d_v", type=int, default=64, help="dimension of the attention Value vectors")
parser.add_argument("--d_model", type=int, default=512, help="word embedding dimension")
parser.add_argument("--n_heads", type=int, default=8, help="number of attention heads")
parser.add_argument("--device", type=str, default="cuda", help="device used for training")
parser.add_argument("--d_ff", type=int, default=2048, help="hidden dimension of the feed-forward layer")
parser.add_argument("--n_layers", type=int, default=6, help="number of encoder/decoder layers (number of blocks)")
args = parser.parse_args()
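
# These defaults match the base Transformer in "Attention Is All You Need":
# d_model=512, n_heads=8, d_k=d_v=64, d_ff=2048, n_layers=6.
# Any of them can be overridden on the command line, e.g. (the script name below is just
# a placeholder for wherever this file lives):
#   python transformer.py --device cpu --n_layers 2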
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.2, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)  # [max_len, d_model] = 5000 x 512
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1] = 5000 x 1
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # [d_model / 2] = 256
        pe[:, 0::2] = torch.sin(position * div_term)  # even columns: 5000 x 256
        pe[:, 1::2] = torch.cos(position * div_term)  # odd columns: 5000 x 256
        pe = pe.unsqueeze(0).transpose(0, 1)  # [max_len, 1, d_model] = 5000 x 1 x 512
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x: [seq_len, batch_size, d_model]
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
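
# In formula form, the buffer built above is the sinusoidal encoding from the original paper:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# A minimal shape sketch (assuming the default d_model = 512):
#   x = torch.zeros(10, 2, 512)           # [seq_len, batch_size, d_model]
#   PositionalEncoding(512)(x).shape      # -> torch.Size([10, 2, 512])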
def get_attn_pad_mask(seq_q, seq_k):
    # Purpose of the pad mask: when the value vectors are combined with the attention weights,
    # the positions that correspond to <PAD> get alpha_ij = 0, so attention never looks at padding.
    """Here q and k denote two sequences (not the Q/K of the attention mechanism),
    e.g. encoder_inputs (x1, x2, ..., xm) and encoder_inputs (x1, x2, ..., xm).
    Both the encoder and the decoder may call this function, so seq_len depends on the caller.
    seq_q: [batch_size, seq_len]
    seq_k: [batch_size, seq_len]
    seq_len could be src_len or it could be tgt_len
    seq_len in seq_q and seq_len in seq_k may differ
    """
    batch_size, len_q = seq_q.size()  # seq_q is only used to expand the mask's dimensions
    batch_size, len_k = seq_k.size()
    # eq(zero) marks the <PAD> token
    # e.g. seq_k = [[1, 2, 3, 4, 0], [1, 2, 3, 5, 0]]
    # [batch_size, 1, len_k], True means masked
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)
    # [batch_size, len_q, len_k]: one len_q x len_k matrix per example in the batch
    return pad_attn_mask.expand(batch_size, len_q, len_k)
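
# A small worked example (a sketch; 0 is assumed to be the <PAD> id, as in the code above):
#   seq_q = seq_k = torch.tensor([[1, 2, 3, 4, 0]])   # [batch_size=1, seq_len=5]
#   get_attn_pad_mask(seq_q, seq_k)[0]
# returns a 5 x 5 matrix whose last column is True, i.e. every query position
# ignores the <PAD> token at key position 4 when the attention weights are computed.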
def get_attn_subsequence_mask(seq):  # input: [batch_size, seq_len]
    """Printing the output makes it obvious what this does (see the sketch below).
    seq: [batch_size, tgt_len]
    """
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    # attn_shape: [batch_size, tgt_len, tgt_len]
    subsequence_mask = np.triu(np.ones(attn_shape), k=1)  # upper-triangular matrix, diagonal excluded
    subsequence_mask = torch.from_numpy(subsequence_mask).byte()
    return subsequence_mask  # [batch_size, tgt_len, tgt_len]
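
# A quick sanity check (a sketch, assuming a dummy target batch of shape [1, 4]):
#   get_attn_subsequence_mask(torch.zeros(1, 4, dtype=torch.long))[0]
# returns the upper-triangular mask
#   [[0, 1, 1, 1],
#    [0, 0, 1, 1],
#    [0, 0, 0, 1],
#    [0, 0, 0, 0]]
# i.e. 1 marks a masked (future) position, so position i can only attend to positions <= i.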
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        Note: in the encoder-decoder attention layer, len_q (q1, ..., qt) and len_k (k1, ..., km) may differ.
        Also note: the mask operation is implemented identically in the encoder and the decoder;
        by controlling the attn_mask that is passed in, this handles both the <PAD> mask and the decoder's subsequence mask.
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / \
            np.sqrt(args.d_k)  # scores: [batch_size, n_heads, len_q, len_k]
        # Fill scores with -1e9 wherever attn_mask is True.
        # masked_fill_ fills elements of the tensor with the given value where the mask is True;
        # the mask shape must be broadcastable with the shape of the underlying tensor.
        scores.masked_fill_(attn_mask, -1e9)
        # After the Q.K^T dot product, the mask is applied before the softmax, so the positions to be
        # hidden must be set to (effectively) negative infinity, which makes their softmax output 0.
        attn = nn.Softmax(dim=-1)(scores)  # softmax over the last dimension (the key positions)
        # attn: [batch_size, n_heads, len_q, len_k] * V: [batch_size, n_heads, len_v(=len_k), d_v]
        # context: [batch_size, n_heads, len_q, d_v]
        context = torch.matmul(attn, V)
        # context: the output vectors [z1, z2, ...]; attn: the attention weight matrix (useful for visualization)
        return context, attn
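
# Written out, the forward pass above is the standard scaled dot-product attention:
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# with masked positions pushed to -1e9 before the softmax so that their weights are ~0.
# Shape check: [.., len_q, d_k] x [.., d_k, len_k] -> [.., len_q, len_k],
# then [.., len_q, len_k] x [.., len_k, d_v] -> [.., len_q, d_v].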
class MultiHeadAttention(nn.Module):
    """This attention class covers all three uses:
    the encoder's self-attention,
    the decoder's masked self-attention,
    and the encoder-decoder attention.
    Input:  seq_len x d_model
    Output: seq_len x d_model
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(args.d_model, args.d_k * args.n_heads,
                             bias=False)  # Q and K must have the same dimension, otherwise the dot product is undefined
        self.W_K = nn.Linear(args.d_model, args.d_k * args.n_heads, bias=False)
        self.W_V = nn.Linear(args.d_model, args.d_v * args.n_heads, bias=False)
        # This fully connected layer guarantees that the multi-head attention output is still seq_len x d_model.
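        # Head bookkeeping (with the defaults above): n_heads * d_k = 8 * 64 = 512 = d_model,
        # so W_Q / W_K / W_V map [.., seq_len, d_model] to [.., seq_len, n_heads * d_k],
        # and fc maps the concatenated heads, [.., seq_len, n_heads * d_v], back to [.., seq_len, d_model].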
        self.fc = nn