This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit 8e44a4c: August 2021 update

liuqiuhui2015 committed Aug 16, 2021
1 parent af848c9 commit 8e44a4c
Showing 170 changed files with 5,585 additions and 1,017 deletions.
README.md (8 changes: 1 addition & 7 deletions)
@@ -17,7 +17,7 @@ We provide scripts to apply Byte-Pair Encoding (BPE) under `scripts/bpe/`.

### convert plain text to tensors for training

-Generate training data for `train.py` with `bash scripts/mktrain.sh`, [configure variables](scripts/README.md#mktrainsh) in `scripts/mktrain.sh` for your usage (the other variables shall comply with those in `scripts/mkbpe.sh`).
+Generate training data for `train.py` with `bash scripts/mktrain.sh`, [configure variables](scripts/README.md#mktrainsh) in `scripts/mktrain.sh` for your usage (the other variables shall comply with those in `scripts/bpe/mk.sh`).

## Configuration for training and testing

@@ -120,9 +120,3 @@ Details of this project can be found [here](https://arxiv.org/abs/1903.07402), a
pdf = {https://arxiv.org/pdf/1903.07402}
}
```

-## Contributor(s)
-
-## Need more?
-
-Every details are in those codes, just explore them and make commits ;-)
adv/predict/doc/para/predict_doc_para.py (2 changes: 0 additions & 2 deletions)
@@ -63,9 +63,7 @@ def load_fixing(module):
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)

#num_prev_sent = cnfg.num_prev_sent
-
beam_size = cnfg.beam_size
-
length_penalty = cnfg.length_penalty

ens = "\n".encode("utf-8")
adv/predict/predict_ape.py (1 change: 0 additions & 1 deletion)
@@ -63,7 +63,6 @@ def load_fixing(module):
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)

beam_size = cnfg.beam_size
-
length_penalty = cnfg.length_penalty

ens = "\n".encode("utf-8")
adv/train/doc/para/train_doc_para.py (50 changes: 19 additions & 31 deletions)
@@ -11,6 +11,7 @@

from utils.base import *
from utils.init import init_model_params
+from utils.contpara import get_model_parameters
from utils.h5serial import h5save, h5load
from utils.fmt.base import tostr, save_states, load_states, pad_id
from utils.fmt.base4torch import parse_cuda, load_emb
@@ -22,9 +23,6 @@

from tqdm import tqdm

-from os import makedirs
-from os.path import exists as p_check
-
import h5py

import cnfg.docpara as cnfg
@@ -77,7 +75,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
_done_tokens += wd_add

if _done_tokens >= tokens_optm:
-optm_step(optm, model=model, scaler=scaler, multi_gpu=multi_gpu, multi_gpu_optimizer=multi_gpu_optimizer)
+optm_step(optm, model=model, scaler=scaler, multi_gpu=multi_gpu, multi_gpu_optimizer=multi_gpu_optimizer, zero_grad_none=optm_step_zero_grad_set_none)
_done_tokens = 0
if _cur_rstep is not None:
if save_checkp_epoch and (save_every is not None) and (_cur_rstep % save_every == 0) and (chkpf is not None) and (_cur_rstep > 0):
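
Note: the hunk above follows a token-count gradient-accumulation pattern: gradients from successive batches are summed until roughly `tokens_optm` target tokens have been processed, and only then is a single optimizer step taken (the new `zero_grad_none` argument presumably controls how gradients are cleared after the step). A minimal, generic PyTorch sketch of the same idea; all names here are placeholders rather than this repository's API:

```python
def accumulate_and_step(model, lossf, optimizer, batches, tokens_optm=25000):
	# batches yields (src, tgt, ntokens); ntokens counts the target tokens in the batch
	done_tokens = 0
	for src, tgt, ntokens in batches:
		loss = lossf(model(src), tgt)
		loss.backward()                 # gradients accumulate across iterations
		done_tokens += ntokens
		if done_tokens >= tokens_optm:  # enough tokens seen: take one optimizer step
			optimizer.step()
			optimizer.zero_grad(set_to_none=True)
			done_tokens = 0
	return done_tokens                  # leftover tokens whose gradients are still pending
```
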
@@ -90,7 +88,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
else:
_chkpf = chkpf
_chkpof = chkpof
-save_model(model, _chkpf, multi_gpu, logger)
+save_model(model, _chkpf, multi_gpu, print_func=logger.info)
if chkpof is not None:
h5save(optm.state_dict(), _chkpof)
if statesf is not None:
@@ -124,7 +122,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
else:
_chkpf = chkpf
_chkpof = chkpof
-save_model(model, _chkpf, multi_gpu, logger)
+save_model(model, _chkpf, multi_gpu, print_func=logger.info)
if chkpof is not None:
h5save(optm.state_dict(), _chkpof)
if statesf is not None:
@@ -183,32 +181,23 @@ def load_fixing(module):
module.fix_load()

rid = cnfg.run_id
-
earlystop = cnfg.earlystop
-
maxrun = cnfg.maxrun
-
tokens_optm = cnfg.tokens_optm
-
done_tokens = 0
-
batch_report = cnfg.batch_report
report_eva = cnfg.report_eva
-
use_ams = cnfg.use_ams
-
save_optm_state = cnfg.save_optm_state
-
+save_auto_clean = cnfg.save_auto_clean
+overwrite_eva = cnfg.overwrite_eva
save_every = cnfg.save_every
start_chkp_save = cnfg.epoch_start_checkpoint_save
-
epoch_save = cnfg.epoch_save
-
remain_steps = cnfg.training_steps

wkdir = "".join((cnfg.exp_dir, cnfg.data_id, "/", cnfg.group_id, "/", rid, "/"))
-if not p_check(wkdir):
-makedirs(wkdir)
+mkdir(wkdir)

chkpf = None
chkpof = None
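
Note: the old code created the working directory with an explicit existence check (`if not p_check(wkdir): makedirs(wkdir)`); the commit replaces this with a single `mkdir(wkdir)` call, presumably a small helper pulled in via the star import from `utils.base`. A sketch of what such a helper typically looks like; the implementation here is a guess, not the repository's code:

```python
from os import makedirs

def mkdir(pth):
	# os.makedirs with exist_ok=True replaces the old "check exists, then makedirs" pair
	if pth:
		makedirs(pth, exist_ok=True)
```
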
@@ -270,7 +259,7 @@ def load_fixing(module):
lossf.to(cuda_device)

optimizer = Optimizer(filter_para_grad(mymodel.parameters()), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
-optimizer.zero_grad(set_to_none=True)
+optimizer.zero_grad(set_to_none=optm_step_zero_grad_set_none)

use_amp = cnfg.use_amp and use_cuda
scaler = (MultiGPUGradScaler() if multi_gpu_optimizer else GradScaler()) if use_amp else None
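
Note: two standard PyTorch details surface in this hunk. `optimizer.zero_grad(set_to_none=True)` releases the `.grad` tensors instead of filling them with zeros, which saves memory and a kernel launch; the commit now routes that choice through a config flag. A `GradScaler` (or the repository's `MultiGPUGradScaler`) is only built when mixed-precision training is enabled. A small, generic sketch of how loss scaling and gradient clearing usually fit together, independent of this repository's wrappers:

```python
from torch.cuda.amp import GradScaler, autocast

def make_scaler(use_amp):
	# as above: only build a scaler when AMP is actually in use
	return GradScaler() if use_amp else None

def amp_step(model, lossf, optimizer, scaler, src, tgt):
	with autocast(enabled=scaler is not None):  # mixed-precision forward and loss
		loss = lossf(model(src), tgt)
	if scaler is not None:
		scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
		scaler.step(optimizer)         # unscales gradients, then steps
		scaler.update()
	else:
		loss.backward()
		optimizer.step()
	optimizer.zero_grad(set_to_none=True)  # drop .grad tensors rather than zero-fill
```
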
@@ -279,12 +268,11 @@
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

-if multi_gpu_optimizer:
-optimizer = mymodel.build_optimizer(Optimizer, lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
-mymodel.zero_grad(set_to_none=True)
+if multi_gpu:
+optimizer = mymodel.build_optimizer(Optimizer, lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams, multi_gpu_optimizer=multi_gpu_optimizer, contiguous_parameters=contiguous_parameters)
else:
-optimizer = Optimizer((mymodel.module if multi_gpu else mymodel).parameters(), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
-optimizer.zero_grad(set_to_none=True)
+optimizer = Optimizer(get_model_parameters(mymodel, contiguous_parameters=contiguous_parameters), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
+optimizer.zero_grad(set_to_none=optm_step_zero_grad_set_none)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
@@ -302,16 +290,16 @@
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
save_model(mymodel, wkdir + "init.h5", multi_gpu, logger)
save_model(mymodel, wkdir + "init.h5", multi_gpu, print_func=logger.info)
logger.info("Initial model saved")
else:
cnt_states = cnfg.train_statesf
-if (cnt_states is not None) and p_check(cnt_states):
+if cnt_states is not None:
logger.info("Continue last epoch")
tminerr, done_tokens, cur_checkid, remain_steps, _ = train(td, load_states(cnt_states), vd, vl, optimizer, lrsch, mymodel, lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm, batch_report, save_every, chkpf, chkpof, statesf, num_checkpoint, cur_checkid, report_eva, remain_steps, False, False, scaler)
vloss, vprec = eva(vd, vl, mymodel, lossf, cuda_device, multi_gpu, use_amp)
logger.info("Epoch: 0, train loss: %.3f, valid loss/error: %.3f %.2f" % (tminerr, vloss, vprec))
save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.h5" % (tminerr, vloss, vprec), multi_gpu, logger)
save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.h5" % (tminerr, vloss, vprec), multi_gpu, print_func=logger.info, mtyp=("eva" if overwrite_eva else "train") if save_auto_clean else None)
if save_optm_state:
h5save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.h5" % (tminerr, vloss, vprec))
logger.info("New best model saved")
@@ -340,7 +328,7 @@ def load_fixing(module):
logger.info("Epoch: %d, train loss: %.3f, valid loss/error: %.3f %.2f" % (i, terr, vloss, vprec))

if (vprec <= minerr) or (vloss <= minloss):
save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)
save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, print_func=logger.info, mtyp="eva" if save_auto_clean else None)
if save_optm_state:
h5save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec))
logger.info("New best model saved")
@@ -355,11 +343,11 @@
else:
if terr < tminerr:
tminerr = terr
save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)
save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, print_func=logger.info, mtyp=("eva" if overwrite_eva else "train") if save_auto_clean else None)
if save_optm_state:
h5save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec))
elif epoch_save:
save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)
save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, print_func=logger.info)

namin += 1
if namin >= earlystop:
@@ -385,7 +373,7 @@ def load_fixing(module):
if done_tokens > 0:
optm_step(optimizer, model=mymodel, scaler=scaler, multi_gpu=multi_gpu, multi_gpu_optimizer=multi_gpu_optimizer)

save_model(mymodel, wkdir + "last.h5", multi_gpu, logger)
save_model(mymodel, wkdir + "last.h5", multi_gpu, print_func=logger.info)
if save_optm_state:
h5save(optimizer.state_dict(), wkdir + "last.optm.h5")
logger.info("model saved")
(Diffs for the remaining 166 changed files are not shown.)
