
Commit d6546ed

merge April 2020 updates of transformer.edge
liuqiuhui2015 committed Apr 30, 2020
1 parent 56ca23b commit d6546ed
Showing 4 changed files with 52 additions and 21 deletions.

cnfg/ihyp.py (2 changes: 1 addition & 1 deletion)
@@ -22,7 +22,7 @@
 		override_GeLU_Sigmoid = True
 	elif _adv_act == "swish":
 		override_GeLU_Swish = True
-inplace_after_GeLU = use_adv_act_default and not override_GeLU_Sigmoid
+inplace_after_GeLU = use_adv_act_default and (not override_GeLU_Sigmoid)
 
 norm_residual_default = not (computation_order.lower() == "v2")
 
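
For clarity, the single change above is purely cosmetic: `not` binds more tightly than `and` in Python, so both forms evaluate identically. A quick standalone check with hypothetical flag values:

use_adv_act_default, override_GeLU_Sigmoid = True, False    # hypothetical values
assert (use_adv_act_default and not override_GeLU_Sigmoid) == (use_adv_act_default and (not override_GeLU_Sigmoid))
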
modules/rnncells.py (48 changes: 33 additions & 15 deletions)
@@ -17,28 +17,36 @@ def prepare_initState(hx, cx, bsize):
 class LSTMCell4RNMT(nn.Module):
 
 	# isize: input size of Feed-forward NN
+	# dropout: dropout over hidden units, disabling it and applying dropout to outputs (_out) in most cases
 
-	def __init__(self, isize, osize, use_GeLU=use_adv_act_default):
+	def __init__(self, isize, osize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default):
 
 		super(LSTMCell4RNMT, self).__init__()
 
-		# layer normalization is also applied for the computation of hidden for efficiency
-		self.trans = Linear(isize + osize, osize * 4)
-		self.normer = nn.LayerNorm((4, osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
+		_osize = isize if osize is None else osize
+
+		# layer normalization is also applied for the computation of hidden for efficiency. bias might be disabled in case provided by LayerNorm
+		self.trans = Linear(isize + _osize, _osize * 4, bias=enable_bias)
+		self.normer = nn.LayerNorm((4, _osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
 
 		self.act = GeLU() if use_GeLU else nn.Tanh()
+		self.drop = Dropout(dropout, inplace=inplace_after_GeLU) if dropout > 0.0 else None
 
-		self.osize = osize
+		self.osize = _osize
 
 	def forward(self, inpute, state):
 
 		_out, _cell = state
 
-		_comb = self.normer(self.trans(torch.cat((inpute, _out), -1)).view(-1, 4, self.osize))
+		osize = list(_out.size())
+		osize.insert(-1, 4)
+
+		_comb = self.normer(self.trans(torch.cat((inpute, _out,), -1)).view(osize))
 
-		_combg, hidden = _comb.narrow(-2, 0, 3).sigmoid(), self.act(_comb.select(-2, 3))
+		(ig, fg, og,), hidden = _comb.narrow(-2, 0, 3).sigmoid().unbind(-2), self.act(_comb.select(-2, 3))
 
-		ig, fg, og = _combg.unbind(-2)
+		if self.drop is not None:
+			hidden = self.drop(hidden)
 
 		_cell = fg * _cell + ig * hidden
 		_out = og * _cell
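
The reworked forward pass above replaces the fixed `view(-1, 4, self.osize)` reshape with a shape list derived from the hidden state, so the cell also handles inputs with more than one leading dimension, and it splits the three gates directly with `unbind`. A minimal standalone sketch of that shape handling, using dummy sizes and plain torch in place of the cell's own layers:

import torch

bsize, osize = 2, 8
proj = torch.randn(bsize, 4 * osize)    # stands in for self.trans(torch.cat((inpute, _out), -1))

prev_out = torch.empty(bsize, osize)    # stands in for the previous hidden state _out
shape = list(prev_out.size())
shape.insert(-1, 4)                     # -> [bsize, 4, osize]; works for any number of leading dims
comb = proj.view(shape)

(ig, fg, og), hidden = comb.narrow(-2, 0, 3).sigmoid().unbind(-2), torch.tanh(comb.select(-2, 3))
print(ig.size(), hidden.size())         # both torch.Size([2, 8])
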
@@ -49,27 +57,37 @@ class GRUCell4RNMT(nn.Module):
 
 	# isize: input size of Feed-forward NN
 
-	def __init__(self, isize, osize, use_GeLU=use_adv_act_default):
+	def __init__(self, isize, osize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default):
 
 		super(GRUCell4RNMT, self).__init__()
 
-		self.trans = Linear(isize + osize, osize * 2)
-		self.transi = Linear(isize, osize)
-		self.transh = Linear(osize, osize)
+		_osize = isize if osize is None else osize
+
+		self.trans = Linear(isize + _osize, _osize * 2, bias=enable_bias)
+		self.transi = Linear(isize, _osize)
+		self.transh = Linear(_osize, _osize)
 
-		self.normer = nn.LayerNorm((2, osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
+		self.normer = nn.LayerNorm((2, _osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
 
 		self.act = GeLU() if use_GeLU else nn.Tanh()
+		self.drop = Dropout(dropout, inplace=inplace_after_GeLU) if dropout > 0.0 else None
 
-		self.osize = osize
+		self.osize = _osize
 
 	def forward(self, inpute, state):
 
-		_comb = self.normer(self.trans(torch.cat((inpute, state), -1)).view(-1, 2, self.osize)).sigmoid()
+		osize = list(state.size())
+		osize.insert(-1, 2)
+
+		_comb = self.normer(self.trans(torch.cat((inpute, state,), -1)).view(osize)).sigmoid()
 
 		ig, fg = _comb.unbind(-2)
 
 		hidden = self.act(self.transi(inpute) + ig * self.transh(state))
 
+		if self.drop is not None:
+			hidden = self.drop(hidden)
+
 		_out = (1.0 - fg) * hidden + fg * state
 
 		return _out
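
With the new signatures, `osize` can be omitted (it defaults to `isize`) and dropout on the candidate hidden state is optional. A brief usage sketch, assuming it is run from the repository root so that `modules.rnncells` and its cnfg dependencies are importable; the sizes are illustrative:

import torch
from modules.rnncells import GRUCell4RNMT

cell = GRUCell4RNMT(512, dropout=0.1)    # osize defaults to isize (512)
x = torch.randn(8, 512)                  # current input, (bsize, isize)
state = torch.zeros(8, 512)              # previous hidden state, (bsize, osize)
new_state = cell(x, state)               # same shape as state
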
parallel/base.py (5 changes: 3 additions & 2 deletions)
@@ -74,6 +74,7 @@ def make_replicas(self):
 	def collect_gradients(self):
 
 		if self.ngradev > 1:
+			# in case some parameters might not be used during the forward propagation on some GPUs: p.data.new_zeros(p.data.size()) if p.grad is None else p.grad instead of p.grad, but in most cases, this can warn you in case you miss the use of some parameters in the forward computation.
 			grads = comm.reduce_add_coalesced([[p.grad for p in filter_para_grad(net.parameters())] for net in self.nets[:self.ngradev]], self.output_device)# if self.ngradev > 1 else [p.grad for p in filter_para_grad(self.nets[0].parameters())]
 			for mp, grad in zip(filter_para_grad(self.module.parameters()), grads):
 				mp.grad = grad
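
The new comment describes a substitution for replicas that did not touch every parameter in their forward pass: a parameter whose `.grad` is still `None` can be replaced by a zero tensor before the coalesced reduction. A standalone sketch of that substitution with a throwaway module, outside the DataParallel machinery:

import torch
import torch.nn as nn

params = list(nn.Linear(4, 4).parameters())    # pretend replica parameters; no backward has run, so .grad is None
grads = [p.data.new_zeros(p.data.size()) if p.grad is None else p.grad for p in params]
print([g.abs().sum().item() for g in grads])   # zero tensors stand in for the missing gradients
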
@@ -123,7 +124,7 @@ def zero_replicas_grad(self):
 		if self.nets is not None and self.ngradev > 1:
 			for net in self.nets[1:self.ngradev]:
 				for para in filter_para_grad(net.parameters()):
-					net.grad = None
+					para.grad = None
 
 	def reset_grad(self):
 
@@ -132,7 +133,7 @@ def reset_grad(self):
 		if self.nets is not None and self.ngradev > 1:
 			for net in self.nets[1:self.ngradev]:
 				for para in filter_para_grad(net.parameters()):
-					net.grad = None
+					para.grad = None
 		self.ngradev = 0
 
 class DataParallelCriterion(DataParallel):
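
The two one-line fixes above matter because gradients live on the parameters, not on the module: assigning `net.grad = None` only attaches an unused attribute to the replica `nn.Module`, while `para.grad = None` actually releases each parameter's gradient. A minimal standalone illustration:

import torch
import torch.nn as nn

net = nn.Linear(4, 4)
net(torch.randn(2, 4)).sum().backward()

net.grad = None                   # old code: sets a throwaway attribute on the Module
print(net.weight.grad is None)    # False, the parameter gradient is untouched

for para in net.parameters():
	para.grad = None              # fixed code: clears the gradient of every parameter
print(net.weight.grad is None)    # True
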
tools/average_model.py (18 changes: 15 additions & 3 deletions)
@@ -8,21 +8,33 @@
 
 import torch
 
+from utils.base import mask_tensor_type
 from utils.h5serial import h5save, h5load
 
 from cnfg.ihyp import *
 
 def handle(srcfl, rsf):
 
+	type_map = {torch.float16: torch.float64, torch.float32: torch.float64, torch.uint8: torch.int64, torch.int8: torch.int64, torch.int16: torch.int64, torch.int32: torch.int64}
+	type_map[mask_tensor_type] = torch.int64
+
 	rsm = h5load(srcfl[0])
 
+	src_type = [para.dtype for para in rsm]
+	map_type = [type_map[para.dtype] if para.dtype in type_map else None for para in rsm]
+	sec_rsm = [para if typ is None else para.to(typ) for para, typ in zip(rsm, map_type)]
+
 	nmodel = 1
 	for modelf in srcfl[1:]:
-		for basep, mpload in zip(rsm, h5load(modelf)):
-			basep.add_(mpload)
+		for basep, mpload, typ in zip(sec_rsm, h5load(modelf), map_type):
+			basep.add_(mpload if typ is None else mpload.to(typ))
 		nmodel += 1
 	nmodel = float(nmodel)
-	for basep in rsm:
+	for basep in sec_rsm:
 		basep.div_(nmodel)
 
+	rsm = [para if mtyp is None else para.to(styp) for para, mtyp, styp in zip(sec_rsm, map_type, src_type)]
+
 	h5save(rsm, rsf, h5args=h5zipargs)
 
 if __name__ == "__main__":
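
The rewritten `handle` now upcasts parameters to a wider dtype (float64 for floating point, int64 for integer and mask tensors), accumulates and averages in that precision, and only casts back to the original dtypes before saving, which avoids overflow and rounding loss when summing many half-precision checkpoints. A self-contained sketch of the same upcast-accumulate-downcast pattern, with dummy tensors standing in for the h5load results:

import torch

checkpoints = [[torch.full((3,), float(i), dtype=torch.float16)] for i in range(4)]    # four pretend models
type_map = {torch.float16: torch.float64, torch.float32: torch.float64}

src_type = [p.dtype for p in checkpoints[0]]
acc = [p.to(type_map.get(p.dtype, p.dtype)) for p in checkpoints[0]]    # upcast copies of the first model
for model in checkpoints[1:]:
	for basep, mpload in zip(acc, model):
		basep.add_(mpload.to(basep.dtype))                              # accumulate in float64
avg = [p.div_(float(len(checkpoints))).to(styp) for p, styp in zip(acc, src_type)]
print(avg[0].dtype, avg[0][0].item())    # torch.float16 1.5
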
