diff --git a/README.md b/README.md index 406c979..4fb9d00 100644 --- a/README.md +++ b/README.md @@ -400,7 +400,7 @@ Measured with `multi-bleu-detok.perl`: | Case-sensitive | 32.63 | 32.26 | 32.97 | 32.89 | | Case-insensitive | 34.06 | 33.70 | 34.36 | 34.28 | -Note: The result of [THUMT implementation](https://github.com/thumt/THUMT) is from [Accelerating Neural Transformer via an Average Attention Network](https://arxiv.org/abs/1805.00631). Averaging of models is not applied in the test of this implementation, since this experiment uses different settings for the saving of checkpoints, in which averaging model greatly hurts performance. Results with length penalty as THUMT applied is reported, but length penalty does not improve the performance of transformer in my experiments. Outputs of last encoder layer and decoder layer are not normalised in this experiment, after we add layer normalization to the output of last encoder layer and decoder layer, averaging of models can totally not work. +Note: The result of the [THUMT implementation](https://github.com/thumt/THUMT) is from [Accelerating Neural Transformer via an Average Attention Network](https://arxiv.org/abs/1805.00631). Averaging of models is not applied in the test of this implementation, since averaging models may hurt performance when layer normalization is applied inside the residual connections. Results with the length penalty applied by THUMT are reported, but the length penalty does not improve the performance of the Transformer in my experiments. The outputs of the last encoder layer and the last decoder layer are not normalised in this experiment. 2, Settings: same with the first except the outputs of last encoder layer and decoder layer is normed and: diff --git a/transformer/AGG/HGraphEncoder.py b/transformer/AGG/HGraphEncoder.py deleted file mode 100644 index 69c01a9..0000000 --- a/transformer/AGG/HGraphEncoder.py +++ /dev/null @@ -1,148 +0,0 @@ -#encoding: utf-8 - -from torch import nn -from modules import * -from math import sqrt - -from transformer.Encoder import EncoderLayer as EncoderLayerUnit -from transformer.Encoder import Encoder as EncoderBase - -# vocabulary: -# :0 -# :1 -# :2 -# :3 -# ...
-# for the classier of the decoder, is omitted - -class EncoderLayerBase(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayerBase, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerUnit(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_sub)]) - - self.combiner = ResidueCombiner(isize, num_sub, _fhsize) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - outs = [] - for net in self.nets: - out = net(out, mask) - outs.append(out) - - return self.combiner(*outs) - -class EncoderLayerStack(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayerStack, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerUnit(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_sub)]) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - for net in self.nets: - out = net(out, mask) - - return out - -class EncoderLayer(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1, num_unit=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayer, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerBase(isize, _fhsize, dropout, attn_drop, num_head, _ahsize, num_unit) for i in range(num_sub)]) - - self.combiner = ResidueCombiner(isize, num_sub, _fhsize) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - outs = [] - for net in self.nets: - out = net(out, mask) - outs.append(out) - - return self.combiner(*outs) - - -class Encoder(EncoderBase): - - # isize: size of word embedding - # nwd: number of words - # num_layer: number of encoder layers - # fhsize: number of hidden units for PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # xseql: maxmimum length of sequence - # ahsize: number of hidden units for MultiHeadAttention - - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=False, num_sub=1, num_unit=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(Encoder, self).__init__(isize, nwd, num_layer, _fhsize, dropout, attn_drop, num_head, xseql, _ahsize, norm_output) - - self.nets = 
nn.ModuleList([EncoderLayerStack(isize, _fhsize, dropout, attn_drop, num_head, _ahsize, num_layer - num_sub * num_unit), EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize, num_sub, num_unit)]) - - def load_base(self, base_encoder): - - self.drop = base_encoder.drop - - self.wemb = base_encoder.wemb - - self.pemb = base_encoder.pemb - - _nets = base_encoder.nets - - net = list(self.nets.modules()) - net[0].nets[0].nets = nn.ModuleList(_nets[:2]) - net[0].nets[1].nets = nn.ModuleList(_nets[2:4]) - net[1].nets = nn.ModuleList(_nets[4:]) - - self.out_normer = None if self.out_normer is None else base_encoder.out_normer - self.nets[-1].combiner.out_normer = base_encoder.out_normer diff --git a/transformer/AGG/LGraphEncoder.py b/transformer/AGG/LGraphEncoder.py deleted file mode 100644 index 975d7f1..0000000 --- a/transformer/AGG/LGraphEncoder.py +++ /dev/null @@ -1,148 +0,0 @@ -#encoding: utf-8 - -from torch import nn -from modules import * -from math import sqrt - -from transformer.Encoder import EncoderLayer as EncoderLayerUnit -from transformer.Encoder import Encoder as EncoderBase - -# vocabulary: -# :0 -# :1 -# :2 -# :3 -# ... -# for the classier of the decoder, is omitted - -class EncoderLayerBase(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayerBase, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerUnit(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_sub)]) - - self.combiner = ResidueCombiner(isize, num_sub, _fhsize) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - outs = [] - for net in self.nets: - out = net(out, mask) - outs.append(out) - - return self.combiner(*outs) - -class EncoderLayerStack(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayerStack, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerUnit(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_sub)]) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - for net in self.nets: - out = net(out, mask) - - return out - -class EncoderLayer(nn.Module): - - # isize: input size - # fhsize: hidden size of PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # ahsize: hidden size of MultiHeadAttention - - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, num_sub=1, num_unit=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(EncoderLayer, self).__init__() - - self.nets = nn.ModuleList([EncoderLayerBase(isize, _fhsize, dropout, 
attn_drop, num_head, _ahsize, num_unit) for i in range(num_sub)]) - - self.combiner = ResidueCombiner(isize, num_sub, _fhsize) - - # inputs: input of this layer (bsize, seql, isize) - - def forward(self, inputs, mask=None): - - out = inputs - outs = [] - for net in self.nets: - out = net(out, mask) - outs.append(out) - - return self.combiner(*outs) - - -class Encoder(EncoderBase): - - # isize: size of word embedding - # nwd: number of words - # num_layer: number of encoder layers - # fhsize: number of hidden units for PositionwiseFeedForward - # attn_drop: dropout for MultiHeadAttention - # num_head: number of heads in MultiHeadAttention - # xseql: maxmimum length of sequence - # ahsize: number of hidden units for MultiHeadAttention - - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=False, num_sub=1, num_unit=1): - - _ahsize = isize if ahsize is None else ahsize - - _fhsize = _ahsize * 4 if fhsize is None else fhsize - - super(Encoder, self).__init__(isize, nwd, num_layer, _fhsize, dropout, attn_drop, num_head, xseql, _ahsize, norm_output) - - self.nets = nn.ModuleList([EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize, num_sub, num_unit), EncoderLayerStack(isize, _fhsize, dropout, attn_drop, num_head, _ahsize, num_layer - num_sub * num_unit)]) - - def load_base(self, base_encoder): - - self.drop = base_encoder.drop - - self.wemb = base_encoder.wemb - - self.pemb = base_encoder.pemb - - _nets = base_encoder.nets - - net = list(self.nets.modules()) - net[0].nets[0].nets = nn.ModuleList(_nets[:2]) - net[0].nets[1].nets = nn.ModuleList(_nets[2:4]) - net[1].nets = nn.ModuleList(_nets[4:]) - - self.out_normer = None if self.out_normer is None else base_encoder.out_normer - self.nets[-1].combiner.out_normer = base_encoder.out_normer diff --git a/transformer/AvgDecoder.py b/transformer/AvgDecoder.py index 04a01ff..e65d045 100644 --- a/transformer/AvgDecoder.py +++ b/transformer/AvgDecoder.py @@ -17,7 +17,7 @@ class DecoderLayer(nn.Module): # num_head: number of heads in MultiHeadAttention # ahsize: hidden size of MultiHeadAttention - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=False): + def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=True): super(DecoderLayer, self).__init__() diff --git a/transformer/Decoder.py b/transformer/Decoder.py index b652d78..4c3881a 100644 --- a/transformer/Decoder.py +++ b/transformer/Decoder.py @@ -14,7 +14,7 @@ class DecoderLayer(nn.Module): # ahsize: hidden size of MultiHeadAttention # norm_residue: residue with layer normalized representation - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=False): + def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=True): super(DecoderLayer, self).__init__() diff --git a/transformer/Encoder.py b/transformer/Encoder.py index 6a12b43..6bcc44c 100644 --- a/transformer/Encoder.py +++ b/transformer/Encoder.py @@ -21,7 +21,7 @@ class EncoderLayer(nn.Module): # ahsize: hidden size of MultiHeadAttention # norm_residue: residue with layer normalized representation - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=False): + def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=True): super(EncoderLayer, 
self).__init__()
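
Context for the `norm_residue` default flipped from `False` to `True` in the three hunks above: per the in-code comment ("residue with layer normalized representation"), the flag controls whether the residual connection carries the layer-normalized representation or the raw sublayer input. The sketch below only illustrates that idea under assumed names (`SublayerResidual`, a generic `sublayer` module); it is not the repository's actual `EncoderLayer`/`DecoderLayer` code.

```python
#encoding: utf-8
# Illustrative sketch (assumed, simplified): how a norm_residue switch can
# decide which tensor the residual connection adds back after a sublayer.

from torch import nn

class SublayerResidual(nn.Module):

	# isize: input size
	# sublayer: any module mapping (bsize, seql, isize) -> (bsize, seql, isize)

	def __init__(self, isize, sublayer, dropout=0.0, norm_residue=True):

		super(SublayerResidual, self).__init__()

		self.net = sublayer
		self.normer = nn.LayerNorm(isize)
		self.drop = nn.Dropout(dropout) if dropout > 0.0 else None
		self.norm_residue = norm_residue

	def forward(self, inputs):

		_inputs = self.normer(inputs)
		out = self.net(_inputs)
		if self.drop is not None:
			out = self.drop(out)

		# norm_residue=True: residual carries the normalized representation;
		# norm_residue=False: residual carries the raw input.
		return out + (_inputs if self.norm_residue else inputs)
```

Flipping the default only changes which tensor is added back; the sublayer computation itself is untouched.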
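
The README note above also mentions checkpoint averaging. For readers unfamiliar with the technique, here is a generic sketch of averaging the parameters of several saved checkpoints; the file paths and the assumption that each checkpoint is a plain `state_dict` are hypothetical, and this is not the repository's own averaging script.

```python
#encoding: utf-8
# Generic checkpoint-averaging sketch (assumed layout): each path points to a
# torch-saved state_dict; the result averages every parameter element-wise.

import torch

def average_checkpoints(paths):

	avg = None
	for path in paths:
		sd = torch.load(path, map_location="cpu")
		if avg is None:
			avg = {k: v.clone().float() for k, v in sd.items()}
		else:
			for k, v in sd.items():
				avg[k].add_(v.float())

	nchk = float(len(paths))

	return {k: (v / nchk).to(sd[k].dtype) for k, v in avg.items()}

# hypothetical usage:
# torch.save(average_checkpoints(["expm/checkpoint_18.t7", "expm/checkpoint_19.t7", "expm/checkpoint_20.t7"]), "expm/avg.t7")
```

As the note above cautions, whether the averaged model actually helps depends on the training setup, in particular where layer normalization is applied.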