def _shell_quote(value):
    """Return ``str(value)`` quoted so the shell passes it on as one literal word."""
    # Local import: the module's own import block is not part of this section.
    import shlex

    return shlex.quote(str(value))


def _cli_option(flag, value):
    """Return the command-line words for ``flag`` followed by ``value``.

    Lists and tuples expand to several quoted words. An empty string yields
    the bare flag, which is what the unquoted command line amounted to.
    """
    if isinstance(value, (list, tuple)):
        return [flag] + [_shell_quote(item) for item in value]
    if str(value) == '':
        return [flag]
    return [flag, _shell_quote(value)]


def _build_generate_cmd(preprocess_dir, checkpoint_path, src, tgt, gen_subset,
                        beam, nbest, max_len_a, max_len_b, remove_bpe,
                        user_dir, extra_options):
    """Assemble the ``fairseq-generate`` shell command as a single string."""
    words = ['fairseq-generate', _shell_quote(preprocess_dir)]
    words += _cli_option('--source-lang', src)
    words += _cli_option('--target-lang', tgt)
    words += _cli_option('--gen-subset', gen_subset)
    words += _cli_option('--path', checkpoint_path)
    words += _cli_option('--max-len-a', max_len_a)
    words += _cli_option('--max-len-b', max_len_b)
    words += _cli_option('--nbest', nbest)
    words += _cli_option('--beam', beam)
    if remove_bpe is not None:
        words += _cli_option('--remove-bpe', remove_bpe)
    if user_dir is not None:
        words += _cli_option('--user-dir', user_dir)

    # Valued options first, then bare boolean switches (same order as before).
    for key, value in extra_options.items():
        if value is None or isinstance(value, bool):
            continue  # None means "not set"; booleans are handled below
        words += _cli_option('--' + key.replace('_', '-'), value)
    for key, value in extra_options.items():
        if isinstance(value, bool) and value:
            words.append('--' + key.replace('_', '-'))
    return ' '.join(words)


def fairseq_generate(GPUs, preprocess_dir, checkpoint_path, results_path, src, tgt, gen_subset='test', beam=10, nbest=1, max_len_a=1, max_len_b=50, remove_bpe=None, user_dir=None, use_Popen=True, **kwargs):
    """Run ``fairseq-generate`` through the shell, writing its stdout to a file.

    Every interpolated value is shell-quoted, so paths containing spaces and
    markers such as ``'@@ '`` reach the program intact. As a consequence the
    values are passed literally: no ``~``, ``$VAR`` or glob expansion happens.

    Args:
        GPUs: Value for ``CUDA_VISIBLE_DEVICES`` (a string such as ``'0,1'``,
            or a list/tuple of ids), or ``None`` to leave the variable alone.
        preprocess_dir: Positional data directory argument.
        checkpoint_path: Forwarded as ``--path``.
        results_path: File that is truncated and receives the command's stdout.
        src: Forwarded as ``--source-lang``.
        tgt: Forwarded as ``--target-lang``.
        gen_subset: Forwarded as ``--gen-subset``.
        beam: Forwarded as ``--beam``.
        nbest: Forwarded as ``--nbest``.
        max_len_a: Forwarded as ``--max-len-a``.
        max_len_b: Forwarded as ``--max-len-b``.
        remove_bpe: Forwarded as ``--remove-bpe`` unless ``None``.
        user_dir: Forwarded as ``--user-dir`` unless ``None``.
        use_Popen: If true, start the command and return immediately;
            otherwise block until it finishes.
        **kwargs: Extra options. ``some_name=value`` becomes
            ``--some-name value``; ``True`` becomes a bare ``--some-name``;
            ``False`` and ``None`` are omitted.

    Returns:
        A ``subprocess.Popen`` (``use_Popen=True``; the caller must ``wait()``)
        or a ``subprocess.CompletedProcess`` (``use_Popen=False``). The exit
        status is not checked here.
    """
    cmd = _build_generate_cmd(preprocess_dir, checkpoint_path, src, tgt,
                              gen_subset, beam, nbest, max_len_a, max_len_b,
                              remove_bpe, user_dir, kwargs)
    if GPUs is not None:
        if isinstance(GPUs, (list, tuple)):
            GPUs = ','.join(str(gpu) for gpu in GPUs)
        cmd = 'CUDA_VISIBLE_DEVICES={} {}'.format(_shell_quote(GPUs), cmd)
    with open(results_path, 'w') as f:
        if use_Popen:
            # The child receives its own copy of the descriptor when it is
            # spawned, so closing our handle on return does not cut it off.
            return subprocess.Popen(cmd, shell=True, stdout=f)
        return subprocess.run(cmd, shell=True, stdout=f)
"""run_corrector.py: run the trained fixer model over the original bad code.

The original bad code is stored in several chunks. Each chunk is binarised
with ``fairseq_preprocess`` (skipped when its output directory already exists)
and then decoded with ``fairseq_generate``; predictions are written to
``model-fixer.pred.txt`` inside the chunk's preprocess directory.
"""
import argparse
import itertools
import shutil
import sys
from pathlib import Path

# NOTE(review): the accompanying change adds fairseq_generate to
# fairseq_utility.py, while this script imports from fairseq_utils - confirm
# which module name is the right one.
from fairseq_utils import fairseq_generate, fairseq_preprocess


def parse_args(argv=None):
    """Parse the command line; ``argv=None`` means ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    # Required: the round directory cannot be built without it.
    parser.add_argument('--round_name', required=True)
    parser.add_argument('--destdir_root', default='')
    parser.add_argument('--gpu_ids', default='0', help='Comma separated list')
    parser.add_argument('--data_dir', default='/content/drive/MyDrive/syscan/data',
                        help='Root directory holding the rounds and the bad-code chunks')
    parser.add_argument('--n_splits', type=int, default=5,
                        help='Number of chunks the original bad code is split into')
    args = parser.parse_args(argv)
    # Tolerate stray spaces and trailing commas in the GPU list.
    args.gpu_ids = [gpu.strip() for gpu in args.gpu_ids.split(',') if gpu.strip()]
    if not args.gpu_ids:
        parser.error('--gpu_ids must name at least one GPU')
    if args.n_splits < 1:
        parser.error('--n_splits must be at least 1')
    return args


def split_destdir(destdir_root, split):
    """Return the preprocess directory belonging to chunk number ``split``."""
    return destdir_root/f'fairseq_preprocess__orig_bad.{split}'


def preprocess_splits(data_dir, destdir_root, n_splits):
    """Binarise every chunk whose preprocess directory does not exist yet."""
    vocab_path = data_dir/'token_vocab.txt'
    for split in range(n_splits):
        destdir = split_destdir(destdir_root, split)
        if destdir.exists():
            continue  # already preprocessed by an earlier run
        fairseq_preprocess(src='bad', tgt='good', workers=10,
                           destdir=str(destdir),
                           testpref=str(data_dir/f'orig_bad_code/orig.{split}'),
                           srcdict=str(vocab_path),
                           only_source=True)
        # Only the source side is preprocessed, so the shared vocabulary is
        # copied in as the 'good' dictionary file.
        shutil.copyfile(str(vocab_path), str(destdir/'dict.good.txt'))


def run_corrector(model_path, destdir_root, gpu_ids, n_splits):
    """Decode every chunk and return the list of per-chunk exit codes."""
    # Hand GPUs out round-robin so each chunk gets one.
    gpus = itertools.islice(itertools.cycle(gpu_ids), n_splits)
    # With a single GPU the chunks run one after another; with several they
    # are launched in the background and collected below.
    use_Popen = (len(gpu_ids) > 1)
    ps = []
    for split, gpu in zip(range(n_splits), gpus):
        destdir = split_destdir(destdir_root, split)
        pred_path = destdir/'model-fixer.pred.txt'
        p = fairseq_generate(str(gpu), str(destdir), str(model_path), str(pred_path),
                             src='bad', tgt='good', gen_subset='test', use_Popen=use_Popen,
                             beam=10, nbest=10, max_len_a=1, max_len_b=50, max_tokens=7000)
        ps.append(p)
    if use_Popen:
        return [p.wait() for p in ps]
    return [p.returncode for p in ps]


def main(argv=None):
    """Preprocess and decode all chunks; return 0 on success, 1 otherwise."""
    args = parse_args(argv)
    data_dir = Path(args.data_dir)
    round_dir = data_dir/args.round_name
    destdir_root = Path(args.destdir_root) if args.destdir_root else round_dir/'orig_bad'

    preprocess_splits(data_dir, destdir_root, args.n_splits)

    model_path = round_dir/'model-fixer'/'checkpoint.pt'
    exit_codes = run_corrector(model_path, destdir_root, args.gpu_ids, args.n_splits)

    failed = [split for split, code in enumerate(exit_codes) if code != 0]
    if failed:
        # Surface failures instead of silently exiting with status 0.
        print('fairseq_generate failed for splits: {}'.format(failed), file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())