generated from gursi26/paper-implementation-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoder.py
51 lines (40 loc) · 1.77 KB
/
encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from torch import nn
from attention import MultiHeadAttention
from other_modules import FeedForward, PositionalEncoder
import torch, math
class TransformerEncoderBlock(nn.Module):
    """Single encoder layer: an attention sub-layer, then a feed-forward one.

    Each sub-layer's output is passed through dropout, added back to that
    sub-layer's input (residual connection) and the sum is layer-normalised.
    """

    def __init__(self, input_dim, num_heads, p=0.1):
        """Create the sub-layers and their dropout / LayerNorm companions.

        Args:
            input_dim: Size handed to ``MultiHeadAttention`` (as its first two
                arguments), to ``FeedForward`` and to both ``nn.LayerNorm``
                modules.
            num_heads: Third argument given to ``MultiHeadAttention``.
            p: Dropout probability shared by both dropout modules.
        """
        super().__init__()

        # Attention sub-layer and its post-processing.
        self.mha = MultiHeadAttention(input_dim, input_dim, num_heads)
        self.layer_norm1 = nn.LayerNorm(input_dim)
        self.dropout1 = nn.Dropout(p=p)

        # Feed-forward sub-layer and its post-processing.
        self.feed_forward = FeedForward(input_dim)
        self.layer_norm2 = nn.LayerNorm(input_dim)
        self.dropout2 = nn.Dropout(p=p)

    def forward(self, x, mask=None):
        """Run ``x`` through both sub-layers and return the normalised result.

        Args:
            x: Input tensor; its last dimension must be compatible with the
                ``nn.LayerNorm(input_dim)`` modules.
            mask: Passed unchanged to the attention module as ``mask=``.
                Its expected shape and semantics are defined by
                ``MultiHeadAttention``, which is not shown here.

        Returns:
            The output of the second layer normalisation.
        """
        # Sub-layer 1: attention -> dropout -> residual add -> norm.
        attended = self.dropout1(self.mha(x, mask=mask))
        hidden = self.layer_norm1(attended + x)

        # Sub-layer 2: feed-forward -> dropout -> residual add -> norm.
        transformed = self.dropout2(self.feed_forward(hidden))
        return self.layer_norm2(transformed + hidden)
class TransformerEncoder(nn.Module):
    """Token embedding plus positional encoding, followed by encoder blocks."""

    def __init__(self, input_dim, d_model, num_heads, n_layers, max_seq_len=4096):
        """Build the embedding, positional encoder and the block stack.

        Args:
            input_dim: Number of entries in the ``nn.Embedding`` table
                (i.e. valid token ids are ``0 .. input_dim - 1``).
            d_model: Embedding width; also the size given to every
                ``TransformerEncoderBlock``.
            num_heads: Forwarded to every ``TransformerEncoderBlock``.
            n_layers: Number of ``TransformerEncoderBlock`` instances stacked.
            max_seq_len: Forwarded to ``PositionalEncoder`` as its second
                argument (presumably the longest supported sequence --
                confirm against ``other_modules.PositionalEncoder``).
        """
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(input_dim, d_model)
        self.pos_enc = PositionalEncoder(d_model, max_seq_len)
        self.encoder_layers = nn.ModuleList(
            [TransformerEncoderBlock(d_model, num_heads) for _ in range(n_layers)]
        )

    def forward(self, x, mask=None):
        """Embed ``x`` and pass it through every encoder block in order.

        Args:
            x: Integer tensor of token ids accepted by ``nn.Embedding``.
            mask: Handed unchanged to each block. Optional; defaults to
                ``None``, the same default ``TransformerEncoderBlock.forward``
                declares for its own ``mask`` parameter.

        Returns:
            The output of the last encoder block (or the positionally encoded
            embeddings when there are no blocks).
        """
        # Embeddings are scaled by sqrt(d_model) before the positional
        # encoder is applied.
        x = self.pos_enc(self.embed(x) * math.sqrt(self.d_model))
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return x
def test_transformer_encoder():
    """Smoke-test ``TransformerEncoder`` by running one forward pass.

    Builds an encoder (5000 embeddings, d_model=512, 8 heads, 3 layers),
    feeds it a batch of 32 identical sequences holding the token ids
    ``0 .. 99`` together with an all-``True`` boolean mask of shape
    ``(32, 8, 100, 100)``, and prints the shape of the output.

    The device is chosen at run time instead of being fixed to ``"mps"``,
    so the check also runs on machines without Apple's Metal backend.
    """
    # Prefer Apple MPS (the original target), then CUDA, then plain CPU.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    encoder = TransformerEncoder(5000, 512, 8, 3).to(device)
    x = torch.arange(0, 100).view(1, 100).repeat(32, 1).to(device)
    mask = torch.ones(32, 8, 100, 100).type(torch.bool).to(device)
    out = encoder(x, mask)
    print(out.shape)
# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_transformer_encoder()