joshsoftware · MiSayali · Feb 14, 2023 · Feb 14, 2023
diff --git a/fairseq_utility.py b/fairseq_utility.py
@@ -76,3 +76,67 @@ def fairseq_train(GPUs, preprocess_dir, save_dir, logfile, src, tgt, model='tran
         if GPUs is not None:
             cmd = 'CUDA_VISIBLE_DEVICES={}  {}'.format(GPUs, cmd)
         subprocess.run(cmd, shell=True)
+
+def fairseq_generate(GPUs, preprocess_dir, checkpoint_path, results_path, src, tgt, gen_subset='test', beam=10, nbest=1, max_len_a=1, max_len_b=50, remove_bpe=None, user_dir=None, use_Popen=True, **kwargs):
+    """Run ``fairseq-generate`` in a shell and send its stdout to a file.
+
+    The command is assembled as a plain string and values are interpolated
+    unquoted, so arguments containing spaces or shell metacharacters must be
+    escaped by the caller.
+
+    Args:
+        GPUs: Value for ``CUDA_VISIBLE_DEVICES``; no prefix is added when ``None``.
+        preprocess_dir: Data argument passed positionally to the command.
+        checkpoint_path: Value of ``--path``.
+        results_path: File opened for writing that receives the command's stdout.
+        src: Value of ``--source-lang``.
+        tgt: Value of ``--target-lang``.
+        gen_subset: Value of ``--gen-subset``.
+        beam: Value of ``--beam``.
+        nbest: Value of ``--nbest``.
+        max_len_a: Value of ``--max-len-a``.
+        max_len_b: Value of ``--max-len-b``.
+        remove_bpe: Value of ``--remove-bpe``; the option is omitted when ``None``.
+        user_dir: Value of ``--user-dir``; the option is omitted when ``None``.
+        use_Popen: Start the command with ``subprocess.Popen`` and return at once
+            when true; otherwise block in ``subprocess.run`` until it exits.
+        **kwargs: Extra options. Underscores in a name become dashes. ``True``
+            yields a bare ``--name`` flag, ``False`` and ``None`` are skipped, and
+            any other value yields ``--name value``.
+
+    Returns:
+        The ``subprocess.Popen`` handle when ``use_Popen`` is true, otherwise the
+        ``subprocess.CompletedProcess`` from ``subprocess.run``.
+    """
+    cmd_parts = [
+        f'fairseq-generate {preprocess_dir}',
+        f'--source-lang {src} --target-lang {tgt}',
+        f'--gen-subset {gen_subset}',
+        f'--path {checkpoint_path}',
+        f'--max-len-a {max_len_a}',
+        f'--max-len-b {max_len_b}',
+        f'--nbest {nbest}',
+        f'--beam {beam}',
+    ]
+    if remove_bpe is not None:
+        cmd_parts.append(f'--remove-bpe {remove_bpe}')
+    if user_dir is not None:
+        cmd_parts.append(f'--user-dir {user_dir}')
+    for key, value in kwargs.items():
+        if value is None:
+            # An unset option must not reach the command line as the text "None".
+            continue
+        flag = '--' + key.replace('_', '-')
+        if isinstance(value, bool):
+            if value:
+                cmd_parts.append(flag)
+        else:
+            cmd_parts.append(f'{flag} {value}')
+    cmd = ' '.join(cmd_parts)
+    if GPUs is not None:
+        cmd = 'CUDA_VISIBLE_DEVICES={}  {}'.format(GPUs, cmd)
+    launch = subprocess.Popen if use_Popen else subprocess.run
+    # The file object is closed here as soon as the call returns; a process started
+    # with Popen was handed the descriptor as its stdout when it was spawned.
+    with open(results_path, 'w') as f:
+        return launch(cmd, shell=True, stdout=f)
diff --git a/run_corrector.py b/run_corrector.py
@@ -0,0 +1,61 @@
+"""Preprocess the original bad-code splits and run the fixer model on them."""
+import argparse
+import itertools
+import shutil
+import sys
+from pathlib import Path
+
+# NOTE(review): the generate helper in this change is added to fairseq_utility.py,
+# while this script imports fairseq_utils -- confirm the module name is intended.
+from fairseq_utils import *
+
+
+parser = argparse.ArgumentParser()
+# Every path below hangs off the round directory, so the name is mandatory.
+parser.add_argument('--round_name', required=True)
+parser.add_argument('--destdir_root', default='')
+parser.add_argument('--gpu_ids', default='0', help='Comma separated list')
+args = parser.parse_args()
+args.gpu_ids = args.gpu_ids.split(",")
+
+data_dir = Path('/content/drive/MyDrive/syscan/data')
+round_dir = data_dir/args.round_name
+destdir_root = Path(args.destdir_root) if args.destdir_root else round_dir/'orig_bad'
+
+n_splits = 5  # all the original bad code is split into 5 chunks for faster processing
+
+# Preprocess inputs
+for split in range(n_splits):
+    destdir = destdir_root/f'fairseq_preprocess__orig_bad.{split}'
+    if destdir.exists():
+        continue  # an existing directory is taken to mean this split is done
+    fairseq_preprocess(src='bad', tgt='good', workers=10,
+                       destdir=str(destdir),
+                       testpref=str(data_dir/f'orig_bad_code/orig.{split}'),
+                       srcdict=str(data_dir/'token_vocab.txt'),
+                       only_source=True)
+    # The target side reuses the source vocabulary as its dictionary.
+    shutil.copy(data_dir/'token_vocab.txt', destdir/'dict.good.txt')
+
+# Run corrector
+model_dir = round_dir/'model-fixer'
+model_path = model_dir/'checkpoint.pt'
+# Deal the GPU ids out round-robin so that every split is assigned one.
+gpus = list(itertools.islice(itertools.cycle(args.gpu_ids), n_splits))
+# Only ask for background processes when more than one GPU id was given.
+use_Popen = len(args.gpu_ids) > 1
+ps = []
+for split, gpu in enumerate(gpus):
+    destdir = destdir_root/f'fairseq_preprocess__orig_bad.{split}'
+    pred_path = destdir/'model-fixer.pred.txt'
+    p = fairseq_generate(str(gpu), str(destdir), str(model_path), str(pred_path),
+                         src='bad', tgt='good', gen_subset='test', use_Popen=use_Popen,
+                         beam=10, nbest=10, max_len_a=1, max_len_b=50, max_tokens=7000)
+    ps.append(p)
+
+# Background processes must be waited on; a blocking call has already finished,
+# so its code is read if the helper returned one (None counts as unknown).
+exit_codes = [p.wait() if use_Popen else getattr(p, 'returncode', None) for p in ps]
+failed = [split for split, code in enumerate(exit_codes) if code]
+if failed:
+    sys.exit(f'fairseq-generate failed for split(s): {failed}')