Transformer

[Figure: the Transformer architecture diagram — encoder stack on the left, decoder stack on the right]

  • The Encoder in the Transformer is a stack of 6 identical layers, each containing 2 sub-layers (a sketch of one such layer follows this list):
    • Multi-Head Self-Attention
    • Position-Wise Feed-Forward Network
  • The Decoder is likewise a stack of 6 identical layers, each containing 3 sub-layers:
    • Multi-Head Self-Attention
    • Multi-Head Context-Attention (attending over the encoder output)
    • Position-Wise Feed-Forward Network
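EncoderLayer and DecoderLayer are built in an earlier section of this series; for reference, here is a minimal sketch of a single encoder layer with the two sub-layers above, each wrapped in a residual connection plus layer normalization. It substitutes tf.keras.layers.MultiHeadAttention for the hand-rolled attention module, so treat it as a structural sketch rather than the original code (note that Keras' attention_mask uses the opposite convention from the tutorial-style masks used further below: nonzero/True means "attend").

import tensorflow as tf

def point_wise_feed_forward_network(d_model, dff):
    # Position-wise FFN: the same two dense layers applied at every position.
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model),
    ])

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        # Keras MHA used here for brevity; the original defines its own.
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model // num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        # Sub-layer 1: multi-head self-attention, then residual + layer norm.
        attn_output = self.mha(x, x, x, attention_mask=mask, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        # Sub-layer 2: position-wise feed-forward, then residual + layer norm.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2  # (batch_size, input_seq_len, d_model)

A DecoderLayer inserts the context-attention sub-layer between these two (queries from the decoder, keys/values from the encoder output) and returns its two attention-weight tensors alongside its output, which is what the Decoder below collects.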

The left half of the Transformer consists of 6 such encoder units; now we stack the EncoderLayers: embedding --> pos_encoding --> dropout --> EncoderLayers.
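The positional_encoding helper is assumed to come from an earlier section. In case it did not survive, here is a minimal sketch following the sinusoidal formula from "Attention Is All You Need" (sin on even embedding indices, cos on odd ones):

import numpy as np
import tensorflow as tf

def get_angles(pos, i, d_model):
    # Angle rate for position `pos` and embedding index `i`.
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # sin on even indices (2i), cos on odd indices (2i+1).
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]  # (1, position, d_model)
    return tf.cast(pos_encoding, dtype=tf.float32)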

Encoder code
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # Add the token embedding and the positional encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # scale by sqrt(d_model)
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)
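A quick shape check with random token ids (the hyperparameters are illustrative, in the style of the TensorFlow tutorial this code follows):

sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8, dff=2048,
                         input_vocab_size=8500, maximum_position_encoding=10000)
temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)
sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)
print(sample_encoder_output.shape)  # (64, 62, 512)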

The right half of the Transformer consists of 6 such decoder units; now we stack the DecoderLayers. Unlike the Encoder, the Decoder's return value also includes every layer's attention_weights, stored in a dict: embedding --> pos_encoding --> dropout --> DecoderLayers.

Decoder code
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        # Collect each layer's attention weights in a dict.
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights
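The look_ahead_mask and padding_mask arguments follow the additive-mask convention of the TensorFlow tutorial, where 1 marks a position to be blocked. Minimal sketches of the two standard helpers, in case they were not defined earlier:

def create_padding_mask(seq):
    # 1 where the token id is 0 (padding); broadcastable over attention logits.
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    # Upper-triangular mask so position i can only attend to positions <= i.
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (size, size)

And a shape check for the Decoder, reusing sample_encoder_output from above (the attention-weight shape comment assumes the tutorial's per-head attention implementation):

sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, dff=2048,
                         target_vocab_size=8000, maximum_position_encoding=5000)
temp_target = tf.random.uniform((64, 26), dtype=tf.int64, minval=0, maxval=200)
output, attn = sample_decoder(temp_target, sample_encoder_output,
                              training=False,
                              look_ahead_mask=None, padding_mask=None)
print(output.shape)  # (64, 26, 512)
print(attn['decoder_layer2_block2'].shape)  # e.g. (64, 8, 26, 62): (batch, heads, target_len, input_len)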

Now assemble the Transformer class: Encoder --> Decoder --> final_layer, where final_layer is a fully connected layer whose output dimension equals the target vocabulary size.

Transformer code
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask,
             look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights
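Finally, an end-to-end shape check (again with illustrative hyperparameters; in real training the three masks would come from create_padding_mask and create_look_ahead_mask above rather than None):

sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048,
    input_vocab_size=8500, target_vocab_size=8000,
    pe_input=10000, pe_target=6000)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, attn = sample_transformer(temp_input, temp_target, training=False,
                                  enc_padding_mask=None,
                                  look_ahead_mask=None,
                                  dec_padding_mask=None)
print(fn_out.shape)  # (64, 36, 8000): logits over the target vocabulary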