This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit: April updates
liuqiuhui2015 committed Apr 26, 2020
1 parent 362ed8d commit 56ca23b
Showing 25 changed files with 700 additions and 52 deletions.
4 changes: 3 additions & 1 deletion adv/train/train_dynb.py
@@ -43,7 +43,7 @@ def select_function(modin, select_index):

return _sel_m.parameters()

grad_mon = GradientMonitor(num_layer * 2, select_function, angle_alpha=cnfg.dyn_tol_alpha, num_tol_amin=cnfg.dyn_tol_amin, num_his_recoder=cnfg.num_dynb_his, num_his_gm=1)
grad_mon = GradientMonitor(num_layer * 2, select_function, module=None, angle_alpha=cnfg.dyn_tol_alpha, num_tol_amin=cnfg.dyn_tol_amin, num_his_recoder=cnfg.num_dynb_his, num_his_gm=1)

def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tokens, multi_gpu, tokens_optm=32768, nreport=None, save_every=None, chkpf=None, chkpof=None, statesf=None, num_checkpoint=1, cur_checkid=0, report_eva=True, remain_steps=None, save_loss=False, save_checkp_epoch=False, use_amp=False):

@@ -95,6 +95,8 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
_perform_dyn_optm_step, _cos_sim = grad_mon.update(model.module if multi_gpu else model)

if _perform_dyn_optm_step or (_done_tokens >= tokens_optm):
if not _perform_dyn_optm_step:
grad_mon.reset()
_do_optm_step = True if _cos_sim is None else (_cos_sim <= update_angle)
if _do_optm_step:
if multi_gpu:
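
The hunk above decides when to run an optimizer step: either the gradient monitor triggers one, or the accumulated token count reaches the budget, in which case the monitor's history is cleared first. A minimal sketch of that control flow, assuming a monitor with the same update()/reset() interface; the helper name maybe_step and its arguments are illustrative, not part of the repository:

def maybe_step(monitor, model, optimizer, done_tokens, token_budget, update_angle):
	# illustrative sketch only; mirrors the diff's decision logic, not the actual train loop
	trigger, cos_sim = monitor.update(model)
	if trigger or done_tokens >= token_budget:
		if not trigger:
			# the step was forced by the token budget, so drop the stale gradient history
			monitor.reset()
		# step only while recent gradients stay within the angle threshold
		if cos_sim is None or cos_sim <= update_angle:
			optimizer.step()
			optimizer.zero_grad()
			return True
	return False
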
11 changes: 5 additions & 6 deletions cnfg/ihyp.py
@@ -8,10 +8,9 @@

from utils.fmt.base import parse_none, parse_double_value_tuple

if ease_optimization:
enable_residual_bias_default = False
else:
enable_residual_bias_default = True
enable_residual_bias_default = not ease_optimization

enable_ln_parameters = True

use_adv_act_default = False
override_GeLU_Swish = False
@@ -35,14 +34,14 @@
ieps_default = 1e-9
ieps_ln_default = 1e-6
ieps_adam_default = 1e-9
ieps_noise_default = ieps_ln_default

ieps_ln_default = parse_none(ieps_ln_default, ieps_default)
ieps_adam_default = parse_none(ieps_adam_default, ieps_default)
ieps_noise_default = ieps_ln_default

adam_betas_default = (0.9, 0.98,)

use_k_relative_position_encoder, use_k_relative_position_decoder = parse_double_value_tuple(use_k_relative_position)
rel_pos_enabled = (max(use_k_relative_position_encoder, use_k_relative_position_decoder) > 0)
disable_std_pemb_encoder, disable_std_pemb_decoder = parse_double_value_tuple(disable_std_pemb)

h5datawargs = {} if hdf5_data_compression is None else {"compression": hdf5_data_compression, "compression_opts": hdf5_data_compression_level, "shuffle":True}
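
For reference, the two helpers imported from utils.fmt.base behave roughly as below; this is a sketch of their assumed semantics (fall back to a default for None, and expand a single value into an encoder/decoder pair), not the actual implementation:

def parse_none(value, default):
	# assumed behaviour: use the default when the configured value is None
	return default if value is None else value

def parse_double_value_tuple(value):
	# assumed behaviour: accept either a single value or an (encoder, decoder) pair
	if isinstance(value, (tuple, list,)):
		return value[0], value[-1]
	return value, value
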
66 changes: 66 additions & 0 deletions modules/LD.py
@@ -0,0 +1,66 @@
#encoding: utf-8

import torch
from torch import nn

from modules.base import Scorer, Linear, Dropout
from modules.act import GeLU

from cnfg.ihyp import *

class ATTNCombiner(nn.Module):

def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default):

super(ATTNCombiner, self).__init__()

_hsize = isize * 4 if hsize is None else hsize

self.net = nn.Sequential(Linear(isize * 2, _hsize), Dropout(dropout, inplace=True), GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize), nn.Sigmoid()) if dropout > 0.0 else nn.Sequential(Linear(isize * 2, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize), nn.Sigmoid())

def forward(self, input1, input2, mask=None):

scores = self.net(torch.cat((input1.expand_as(input2), input2,), dim=-1))

_seql = input2.size(-2)
if mask is not None:
_tm = mask.sum(-2, keepdim=True)
_nele = (_seql - _tm).masked_fill(_tm.eq(_seql), 1).to(scores)
scores = scores / _nele
else:
scores = scores / _seql
scores = scores.masked_fill(mask, 0.0)

out = scores.transpose(1, 2).bmm(input2) + (1.0 - scores.sum(dim=-2, keepdim=True)) * input1

return out
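
A quick shape check for ATTNCombiner, as a usage sketch; it assumes the package root is on sys.path, and the batch size, sequence length, and model width below are arbitrary:

import torch
from modules.LD import ATTNCombiner

bsize, seql, isize = 2, 7, 512
summary = torch.randn(bsize, 1, isize)                 # input1: one vector per sequence
states = torch.randn(bsize, seql, isize)               # input2: per-token states
mask = torch.zeros(bsize, seql, 1, dtype=torch.bool)   # True marks <pad> positions

combiner = ATTNCombiner(isize)
out = combiner(summary, states, mask)
print(out.size())                                      # expected: torch.Size([2, 1, 512])
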

class DATTNCombiner(nn.Module):

def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default):

super(DATTNCombiner, self).__init__()

_hsize = isize * 4 if hsize is None else hsize

self.net = nn.Sequential(Linear(isize * 2, _hsize), Dropout(dropout, inplace=True), GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize, bias=False)) if dropout > 0.0 else nn.Sequential(Linear(isize * 2, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize, bias=False))

# input1: (bsize, 1, isize)
# input2: (bsize, seql, isize)
# mask: (bsize, seql, 1)
def forward(self, input1, input2, mask=None):

# scores: (bsize, seql, 1)
scores = self.net(torch.cat((input1.expand_as(input2), input2,), dim=-1))

_seql = input2.size(-2)
if mask is not None:
# using math.inf as inf_default leads to NaN after softmax when a sequence consists entirely of <pad> tokens; prefer a large finite value for inf_default, e.g. 1e9.
scores = scores.masked_fill(mask, -inf_default)

scores = scores.softmax(dim=-2)

# out: (bsize, 1, isize)
out = scores.transpose(1, 2).bmm(input2)

return out
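
The comment in DATTNCombiner.forward above can be reproduced in isolation: masking every position of a sequence with -inf makes the subsequent softmax return NaN, while a large finite constant keeps it defined. A small standalone check:

import math
import torch

scores = torch.zeros(1, 4, 1)
mask = torch.ones(1, 4, 1, dtype=torch.bool)                # an all-<pad> sequence
print(scores.masked_fill(mask, -math.inf).softmax(dim=-2))  # all nan
print(scores.masked_fill(mask, -1e9).softmax(dim=-2))       # uniform 0.25 per position
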
5 changes: 3 additions & 2 deletions modules/act.py
@@ -54,8 +54,9 @@ def forward(self, x):

def fix_init(self):

if self.reset_beta is not None:
self.beta.fill_(self.reset_beta)
with torch.no_grad():
if self.reset_beta is not None:
self.beta.fill_(self.reset_beta)

if override_GeLU_Swish:
GeLU = Swish
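
The torch.no_grad() wrapper added to fix_init above matters because in-place edits of a leaf Parameter that requires grad are rejected by autograd; a minimal reproduction:

import torch
from torch import nn

beta = nn.Parameter(torch.ones(1))
# beta.fill_(0.5) here would raise a RuntimeError (in-place op on a leaf requiring grad)
with torch.no_grad():
	beta.fill_(0.5)  # safe: the reset is excluded from gradient tracking
print(beta)
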
30 changes: 16 additions & 14 deletions modules/base.py
@@ -27,7 +27,7 @@ def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=norm_residual_d

self.net = nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Dropout(dropout, inplace=inplace_after_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Linear(_hsize, isize, bias=enable_bias))

self.normer = nn.LayerNorm(isize, eps=ieps_ln_default)
self.normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

self.norm_residual = norm_residual

@@ -138,7 +138,7 @@ def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v
if k_rel_pos > 0:
self.k_rel_pos = k_rel_pos
self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
_rpm = torch.arange(-xseql + 1, 1).unsqueeze(0)
_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos)
self.xseql = xseql
# the buffer can be shared across layers inside the encoder or the decoder to save memory, by setting self.ref_rel_posm of the self-attention in deeper layers to the SelfAttn of layer 0 and sharing the corresponding self.rel_pos
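
The relative-position buffer built above can be inspected in isolation; a standalone sketch with small illustrative values for xseql and k_rel_pos:

import torch

xseql, k_rel_pos = 8, 4
_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
rel_pos = (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos
# rel_pos[i, j] indexes the embedding for the clamped offset j - i, shifted into [0, 2 * k_rel_pos]
print(rel_pos.size(), rel_pos.min().item(), rel_pos.max().item())  # torch.Size([8, 8]) 0 8
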
@@ -275,7 +275,7 @@ def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=ena
if k_rel_pos > 0:
self.k_rel_pos = k_rel_pos
self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
_rpm = torch.arange(-xseql + 1, 1).unsqueeze(0)
_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos)
self.xseql = xseql
# the buffer can be shared across layers inside the encoder or the decoder to save memory, by setting self.ref_rel_posm of the self-attention in deeper layers to the SelfAttn of layer 0 and sharing the corresponding self.rel_pos
@@ -399,7 +399,7 @@ def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0, use_GeLU=use_adv_act
# should dropout be in front of sigmoid or not?
self.net = nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Dropout(dropout, inplace=inplace_after_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Linear(_hsize, isize, bias=enable_bias))

self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default)
self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

def forward(self, *xl):

@@ -503,7 +503,7 @@ def _threshold_and_support(input, dim=0):
def _make_ix_like(input, dim=0):

d = input.size(dim)
rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
rho = torch.arange(1, d + 1, dtype=input.dtype, device=input.device)
view = [1] * input.dim()
view[0] = -1

@@ -574,36 +574,36 @@ class SparseNormer(nn.Module):

# dim: dimension to normalize

def __init__(self, dim=-1, ieps=1e-32):
def __init__(self, dim=-1, eps=ieps_default):

super(SparseNormer, self).__init__()

self.dim = dim
self.bias = nn.Parameter(torch.zeros(1))
self.act = nn.ReLU(inplace=True)
self.ieps = ieps
self.eps = eps

def forward(self, x):

_tmp = self.act(x + self.bias)
_tmp = _tmp * _tmp

# fix zero-division in case all elements in _tmp are 0.
return _tmp / (_tmp.sum(self.dim, keepdim=True) + self.ieps)
return _tmp / (_tmp.sum(self.dim, keepdim=True) + self.eps)

class MHSparseNormer(nn.Module):

# nheads: number of heads
# dim: dimension to normalize

def __init__(self, nheads, dim=-1, ieps=1e-32):
def __init__(self, nheads, dim=-1, eps=ieps_default):

super(MHSparseNormer, self).__init__()

self.dim = dim
self.bias = nn.Parameter(torch.zeros(1, nheads, 1, 1))
self.act = nn.ReLU(inplace=True)
self.ieps = ieps
self.eps = eps

# input should be: (bsize, nheads, nquery, seql)
def forward(self, x):
@@ -612,11 +612,12 @@ def forward(self, x):
_tmp = _tmp * _tmp

# fix zero-division in case all elements in _tmp are 0.
return _tmp / (_tmp.sum(self.dim, keepdim=True) + self.ieps)
return _tmp / (_tmp.sum(self.dim, keepdim=True) + self.eps)

def fix_init(self):

self.bias.data.zero_()
with torch.no_grad():
self.bias.data.zero_()
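
The normalization in SparseNormer and MHSparseNormer above squares a shifted ReLU and divides by its eps-protected sum; a small numeric check of why the eps matters (the learned bias is omitted here):

import torch

x = torch.tensor([[-1.0, 0.5, 2.0], [-3.0, -2.0, -1.0]])
_tmp = torch.relu(x) ** 2
probs = _tmp / (_tmp.sum(-1, keepdim=True) + 1e-9)
print(probs)  # the second row stays all zeros instead of dividing 0 by 0 (NaN)
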

class MHAttnSummer(nn.Module):

@@ -753,8 +754,9 @@ def forward(self, x):

def fix_init(self):

self.k.data.fill_(1.0)
self.bias.data.zero_()
with torch.no_grad():
self.k.data.fill_(1.0)
self.bias.data.zero_()

def reduce_model(modin):

4 changes: 2 additions & 2 deletions modules/rnncells.py
@@ -24,7 +24,7 @@ def __init__(self, isize, osize, use_GeLU=use_adv_act_default):

# layer normalization is also applied to the computation of the hidden state for efficiency
self.trans = Linear(isize + osize, osize * 4)
self.normer = nn.LayerNorm((4, osize), eps=1e-06)
self.normer = nn.LayerNorm((4, osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

self.act = GeLU() if use_GeLU else nn.Tanh()
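
The LayerNorm((4, osize)) above normalizes the fused gate projection in one shot; a sketch of the intended usage, assuming the projection output is viewed as (bsize, 4, osize) before normalization:

import torch
from torch import nn

bsize, isize, osize = 2, 16, 32
trans = nn.Linear(isize + osize, osize * 4)
normer = nn.LayerNorm((4, osize), eps=1e-6)

x = torch.randn(bsize, isize + osize)              # concatenated input and previous state
gates = normer(trans(x).view(bsize, 4, osize))     # one LayerNorm over all four gate blocks
print(gates.size())                                # torch.Size([2, 4, 32])
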

@@ -57,7 +57,7 @@ def __init__(self, isize, osize, use_GeLU=use_adv_act_default):
self.transi = Linear(isize, osize)
self.transh = Linear(osize, osize)

self.normer = nn.LayerNorm((2, osize), eps=1e-06)
self.normer = nn.LayerNorm((2, osize), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

self.act = GeLU() if use_GeLU else nn.Tanh()

4 changes: 3 additions & 1 deletion tools/check/dynb/report_dynb.py
@@ -45,7 +45,7 @@ def select_function(modin, select_index):

return _sel_m.parameters()

grad_mon = GradientMonitor(num_layer * 2, select_function, angle_alpha=cnfg.dyn_tol_alpha, num_tol_amin=cnfg.dyn_tol_amin, num_his_recoder=cnfg.num_dynb_his, num_his_gm=max_his)
grad_mon = GradientMonitor(num_layer * 2, select_function, module=None, angle_alpha=cnfg.dyn_tol_alpha, num_tol_amin=cnfg.dyn_tol_amin, num_his_recoder=cnfg.num_dynb_his, num_his_gm=max_his)

def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tokens, multi_gpu, tokens_optm=32768, nreport=None, save_every=None, chkpf=None, chkpof=None, statesf=None, num_checkpoint=1, cur_checkid=0, report_eva=True, remain_steps=None, save_loss=False, save_checkp_epoch=False, use_amp=False):

@@ -107,6 +107,8 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
_log_f_dynbatch.write(("%d %s\n" % (wd_add, " ".join(["%.2f" % (_cu,) for _cu in _cos_sim_l]))).encode("utf-8"))

if _perform_dyn_optm_step or (_done_tokens >= tokens_optm):
if not _perform_dyn_optm_step:
grad_mon.reset()
_do_optm_step = True if _cos_sim is None else (_cos_sim <= update_angle)
if _do_optm_step:
if log_dynb:
6 changes: 3 additions & 3 deletions transformer/Decoder.py
@@ -32,8 +32,8 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a

self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residual)

self.layer_normer1 = nn.LayerNorm(isize, eps=ieps_ln_default)
self.layer_normer2 = nn.LayerNorm(isize, eps=ieps_ln_default)
self.layer_normer1 = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
self.layer_normer2 = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

@@ -134,7 +134,7 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.

self.lsm = nn.LogSoftmax(-1)

self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None
self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters) if norm_output else None

self.fbl = None if forbidden_index is None else tuple(set(forbidden_index))

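
Several files in this commit pass elementwise_affine=enable_ln_parameters to nn.LayerNorm; the flag only toggles the learnable scale and bias, as the small check below shows:

import torch
from torch import nn

ln_affine = nn.LayerNorm(512, eps=1e-6, elementwise_affine=True)   # learnable weight and bias
ln_plain = nn.LayerNorm(512, eps=1e-6, elementwise_affine=False)   # pure normalization, no parameters
print(sum(p.numel() for p in ln_affine.parameters()))  # 1024
print(sum(p.numel() for p in ln_plain.parameters()))   # 0
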
2 changes: 1 addition & 1 deletion transformer/Doc/Para/Base/Decoder.py
@@ -21,7 +21,7 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a
super(DecoderLayer, self).__init__(isize, fhsize, dropout, attn_drop, num_head, _ahsize)

self.cattns = nn.ModuleList([CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) for i in range(ncross)])
self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default) for i in range(ncross)])
self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters) for i in range(ncross)])
self.grs = nn.ModuleList([GateResidual(isize) for i in range(ncross)])

def forward(self, inpute, inputo, inputc, src_pad_mask=None, tgt_pad_mask=None, context_mask=None, query_unit=None):
2 changes: 1 addition & 1 deletion transformer/Doc/Para/Base/Encoder.py
@@ -22,7 +22,7 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a
super(CrossEncoderLayer, self).__init__(isize, fhsize, dropout, attn_drop, num_head, _ahsize)

self.cattns = nn.ModuleList([CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) for i in range(ncross)])
self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default) for i in range(ncross)])
self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters) for i in range(ncross)])
self.grs = nn.ModuleList([GateResidual(isize) for i in range(ncross)])

def forward(self, inputs, inputc, mask=None, context_mask=None):
2 changes: 1 addition & 1 deletion transformer/Doc/Para/Base/NMT.py
@@ -24,7 +24,7 @@ def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_

self.dec = Decoder(isize, tnwd, dec_layer, fhsize, dropout, attn_drop, emb_w, num_head, xseql, ahsize, norm_output, bindDecoderEmb, forbidden_index, nprev_context)

if use_k_relative_position > 0:
if rel_pos_enabled:
share_rel_pos_cache(self)

def forward(self, inpute, inputo, inputc, mask=None, context_mask=None):
4 changes: 2 additions & 2 deletions transformer/Encoder.py
@@ -38,7 +38,7 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a

self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residual)

self.layer_normer = nn.LayerNorm(isize, eps=ieps_ln_default)
self.layer_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

@@ -91,7 +91,7 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.
else:
self.nets = nn.ModuleList([EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None
self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters) if norm_output else None

# inputs: (bsize, seql)
# mask: (bsize, 1, seql), generated with:
