From ffdbe6770836d56d9670f3a3ff0c60036eaf4f34 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 30 Apr 2019 13:42:39 +0200 Subject: [PATCH 01/46] fixes to cnn embedding --- pie/models/embedding.py | 47 +++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/pie/models/embedding.py b/pie/models/embedding.py index 8eec432..da262a9 100644 --- a/pie/models/embedding.py +++ b/pie/models/embedding.py @@ -6,16 +6,40 @@ from pie import torch_utils from pie import initialization -from .encoder import RNNEncoder from .lstm import CustomBiLSTM +class Highway(torch.nn.Module): + def __init__(self, input_dim, num_layers=1, activation=torch.nn.functional.relu): + super(Highway, self).__init__() + + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]) + self.activation = activation + + for layer in self.layers: + layer.bias[input_dim:].data.fill_(1) + + def forward(self, inputs): + current_input = inputs + + for layer in self.layers: + projected_input = layer(current_input) + linear_part = current_input + nonlinear_part, gate = projected_input.chunk(2, dim=-1) + nonlinear_part = self.activation(nonlinear_part) + gate = torch.sigmoid(gate) + current_input = gate * linear_part + (1 - gate) * nonlinear_part + + return current_input + + class CNNEmbedding(nn.Module): """ Character-level Embeddings with Convolutions following Kim 2014. """ def __init__(self, num_embeddings, embedding_dim, padding_idx=None, - kernel_sizes=(5, 4, 3), out_channels=100): + highway_layers=2, kernel_sizes=(5, 4, 3), out_channels=32): self.num_embeddings = num_embeddings self.embedding_dim = out_channels * len(kernel_sizes) self.kernel_sizes = kernel_sizes @@ -27,12 +51,17 @@ def __init__(self, num_embeddings, embedding_dim, padding_idx=None, convs = [] for W in kernel_sizes: - wide_pad = (0, (W-1) // 2) - conv = nn.Conv2d( - 1, out_channels, (embedding_dim, W), padding=wide_pad) + padding = ((W//2) - 1, W - (W//2), 0, 0) + conv = torch.nn.Sequential( + torch.nn.ZeroPad2d(padding), + torch.nn.Conv2d(1, out_channels, (embedding_dim, W))) convs.append(conv) self.convs = nn.ModuleList(convs) + self.highway = None + if highway_layers > 0: + self.highway = Highway(self.embedding_dim, highway_layers) + self.init() def init(self): @@ -47,17 +76,17 @@ def forward(self, char, nchars, nwords): emb = emb.transpose(1, 2) # (batch x emb_dim x seq_len) emb = emb.unsqueeze(1) # (batch x 1 x emb_dim x seq_len) - conv_outs, maxlen = [], 0 + conv_outs = [] for conv in self.convs: # (batch x C_o x seq_len) conv_outs.append(F.relu(conv(emb).squeeze(2))) - maxlen = max(maxlen, conv_outs[-1].size(2)) - conv_outs = [F.pad(out, (0, maxlen - out.size(2))) for out in conv_outs] # (batch * nwords x C_o * len(kernel_sizes) x seq_len) conv_outs = torch.cat(conv_outs, dim=1) # (batch * nwords x C_o * len(kernel_sizes) x 1) - conv_out = F.max_pool1d(conv_outs, maxlen).squeeze(2) + conv_out = F.max_pool1d(conv_outs, conv_outs.size(2)).squeeze(2) + if self.highway is not None: + conv_out = self.highway(conv_out) conv_out = torch_utils.pad_flat_batch( conv_out, nwords, maxlen=max(nwords).item()) From a35e89bba2440efc7be6c7017db1c2724ad4aa19 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 30 Apr 2019 13:44:02 +0200 Subject: [PATCH 02/46] fixes --- pie/data/dataset.py | 6 ++-- pie/models/decoder.py | 80 +++++++++---------------------------------- 2 files changed, 19 insertions(+), 67 deletions(-) diff --git a/pie/data/dataset.py 
b/pie/data/dataset.py index e0fc62c..b2ce1f0 100644 --- a/pie/data/dataset.py +++ b/pie/data/dataset.py @@ -22,14 +22,14 @@ class LabelEncoder(object): """ def __init__(self, level='token', name=None, target=None, lower=False, preprocessor=None, max_size=None, min_freq=1, - pad=True, eos=False, bos=False, **meta): + pad=True, eos=False, bos=False, reserved=(), **meta): if level.lower() not in ('token', 'char'): raise ValueError("`level` must be 'token' or 'char'. Got ", level) self.meta = meta # dictionary with other task-relevant information - self.eos = constants.EOS if eos else None self.pad = constants.PAD if pad else None + self.eos = constants.EOS if eos else None self.bos = constants.BOS if bos else None self.lower = lower self.preprocessor = preprocessor @@ -40,7 +40,7 @@ def __init__(self, level='token', name=None, target=None, lower=False, self.level = level.lower() self.target = target self.name = name - self.reserved = (constants.UNK,) # always use <unk> + self.reserved = reserved + (constants.UNK,) # always use <unk> self.reserved += tuple([sym for sym in [self.eos, self.pad, self.bos] if sym]) self.freqs = Counter() self.known_tokens = set() # for char-level dicts, keep word-level known tokens diff --git a/pie/models/decoder.py b/pie/models/decoder.py index d7ba31c..9966721 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -263,8 +263,8 @@ def __init__(self, label_encoder, in_dim, hidden_size, scorer='general', self.init_rnn = init_rnn super().__init__() - if label_encoder.get_eos() is None and label_encoder.get_bos() is None: - raise ValueError("AttentionalDecoder needs at least one of <eos> or <bos>") + if label_encoder.get_eos() is None or label_encoder.get_bos() is None: + raise ValueError("AttentionalDecoder needs <eos> and <bos>") nll_weight = torch.ones(len(label_encoder)) nll_weight[label_encoder.get_pad()] = 0. @@ -292,11 +292,6 @@ def forward(self, targets, lengths, enc_outs, src_lengths, context=None): the targets for the `loss` method. Takes care of padding.
""" targets, lengths = targets[:-1], lengths - 1 - if self.label_encoder.get_bos() is None: # needs prepad - targets = torch_utils.pad( - targets, pad=self.label_encoder.get_eos(), pos='pre') - lengths += 1 - embs = self.embs(targets) if self.context_dim > 0: @@ -326,8 +321,7 @@ def loss(self, logits, targets): logits : tensor(seq_len x batch x vocab) targets : tensor(seq_len x batch) """ - if self.label_encoder.get_bos() is not None: - targets = targets[1:] # remove from targets + targets = targets[1:] # remove from targets loss = F.cross_entropy( logits.view(-1, len(self.label_encoder)), targets.view(-1), @@ -338,52 +332,8 @@ def loss(self, logits, targets): return loss - def predict_sequence(self, enc_outs, lengths, context=None): - """ - Decoding routine with step-wise argmax for fixed output lengths - - Parameters - =========== - enc_outs : tensor(src_seq_len x batch x hidden_size) - context : tensor(batch x hidden_size), optional - """ - hidden = None - batch = enc_outs.size(1) - device = enc_outs.device - mask = torch.ones(batch, dtype=torch.int64, device=device) - inp = torch.zeros(batch, dtype=torch.int64, device=device) - inp += self.label_encoder.get_eos() - hyps, scores = [], 0 - - for i in range(max(lengths.tolist())): - # prepare input - emb = self.embs(inp) - if context is not None: - emb = torch.cat([emb, context], dim=1) - # run rnn - emb = emb.unsqueeze(0) - outs, hidden = self.rnn(emb, hidden) - outs, _ = self.attn(outs, enc_outs, lengths) - outs = self.proj(outs).squeeze(0) - # get logits - probs = F.log_softmax(outs, dim=1) - # sample and accumulate - score, inp = probs.max(1) - hyps.append(inp.tolist()) - mask = mask * (i != lengths).long() - score[mask == 0] = 0 - scores += score - - # remove / if given - start = 1 if self.label_encoder.get_bos() else 0 - lengths = lengths - 1 if self.label_encoder.get_eos() else lengths - hyps = [self.label_encoder.inverse_transform(hyp[start:length]) - for hyp, length in zip(zip(*hyps), lengths.tolist())] - scores = (scores / lengths.float()).tolist() - - return hyps, scores - - def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20): + def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20, + bos=None, eos=None): """ Decoding routine for inference with step-wise argmax procedure @@ -392,10 +342,11 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20): enc_outs : tensor(src_seq_len x batch x hidden_size) context : tensor(batch x hidden_size), optional """ + eos = eos or self.label_encoder.get_eos() + bos = bos or self.label_encoder.get_bos() hidden, batch, device = None, enc_outs.size(1), enc_outs.device mask = torch.ones(batch, dtype=torch.int64, device=device) - inp = torch.zeros(batch, dtype=torch.int64, device=device) - inp += self.label_encoder.get_bos() + inp = torch.zeros(batch, dtype=torch.int64, device=device) + bos hyps, scores = [], 0 for _ in range(max_seq_len): @@ -416,7 +367,7 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20): # sample and accumulate score, inp = probs.max(1) hyps.append(inp.tolist()) - mask = mask * (inp != self.label_encoder.get_eos()).long() + mask = mask * (inp != eos).long() score = score.cpu() score[mask == 0] = 0 scores += score @@ -426,7 +377,8 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20): return hyps, scores - def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12): + def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12, + eos=None, 
bos=None): """ Decoding routine for inference with beam search @@ -435,11 +387,11 @@ def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12 enc_outs : tensor(src_seq_len x batch x hidden_size) context : tensor(batch x hidden_size), optional """ - hidden = None - (seq_len, batch, _), device = enc_outs.size(), enc_outs.device - beams = [Beam(width, eos=self.label_encoder.get_eos(), - bos=self.label_encoder.get_bos(), device=device) - for _ in range(batch)] + eos = eos or self.label_encoder.get_eos() + bos = bos or self.label_encoder.get_bos() + hidden, device = None, enc_outs.device + seq_len, batch, _ = enc_outs.size() + beams = [Beam(width, eos=eos, bos=bos, device=device) for _ in range(batch)] # expand data along beam width # (seq_len x beam * batch x hidden_size) From 6e2baf2c4c7506ca4defdb9b18dbd80dfeffe8b9 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 30 Apr 2019 14:20:50 +0200 Subject: [PATCH 03/46] cnn embedding fix --- pie/models/embedding.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pie/models/embedding.py b/pie/models/embedding.py index da262a9..d9978ac 100644 --- a/pie/models/embedding.py +++ b/pie/models/embedding.py @@ -66,7 +66,10 @@ def __init__(self, num_embeddings, embedding_dim, padding_idx=None, def init(self): initialization.init_embeddings(self.emb) - for conv in self.convs: + for conv_seq in self.convs: + for conv in conv_seq: + if isinstance(conv, torch.nn.ZeroPad2d): + continue initialization.init_conv(conv) def forward(self, char, nchars, nwords): From ed7eca17276c78e49e6ef95a74dd61cd5e381d1c Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Mon, 6 May 2019 11:32:06 +0200 Subject: [PATCH 04/46] minor fixes --- pie/models/decoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index 9966721..fa0be77 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -133,8 +133,8 @@ def init(self): def forward(self, enc_outs): "get logits of the input features" # (seq_len x batch x vocab) - if self.highway is None: - enc_out = self.highway(enc_outs) + if self.highway is not None: + enc_outs = self.highway(enc_outs) logits = self.projection(enc_outs) return F.log_softmax(logits, -1) @@ -225,7 +225,7 @@ def predict(self, enc_outs, lengths): tag_sequence = logits.new(seq_len + 2, vocab + 2) # iterate over batches - for logits_b, len_b in zip(logits.t(), lengths): + for logits_b, len_b in zip(logits.transpose(0, 1), lengths): seq_len = len_b.item() # get this batch logits tag_sequence.fill_(-10000) From 104c40b3bd6f78e25b91ea8d4406cf0196d2c799 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:08:19 +0200 Subject: [PATCH 05/46] removed processing files --- .../historical/process_bab_versie_2.1alpha.py | 58 ------------- datasets/historical/process_geste.py | 58 ------------- datasets/historical/process_goo300k.py | 43 ---------- datasets/historical/process_llct.py | 67 --------------- datasets/historical/process_rem.py | 85 ------------------- datasets/historical/process_ren.py | 57 ------------- 6 files changed, 368 deletions(-) delete mode 100644 datasets/historical/process_bab_versie_2.1alpha.py delete mode 100644 datasets/historical/process_geste.py delete mode 100644 datasets/historical/process_goo300k.py delete mode 100644 datasets/historical/process_llct.py delete mode 100644 datasets/historical/process_rem.py delete mode 100644 datasets/historical/process_ren.py diff --git 
a/datasets/historical/process_bab_versie_2.1alpha.py b/datasets/historical/process_bab_versie_2.1alpha.py deleted file mode 100644 index 6779536..0000000 --- a/datasets/historical/process_bab_versie_2.1alpha.py +++ /dev/null @@ -1,58 +0,0 @@ - -from lxml import etree -import os -import random -random.seed(1001) - -root = 'bab_versie_2.1alpha' -# make devsplit -train = os.listdir(os.path.join(root, 'train')) -random.shuffle(train) - - -TEI = 'http://www.tei-c.org/ns/1.0' -UNRESOLVED = 'UNRESOLVED' # it's already use for complex cases - - -def readlines(path): - with open(path) as f: - tree = etree.fromstring(f.read()).getroottree() - for w in tree.findall('//tei:w', namespaces={'tei': TEI}): - if "misAlignment" in w.attrib: - # continue - continue - token, lemma, pos = w.text, w.attrib['lemma'], w.attrib['pos'] - # resort to the token for numbers (otherwise you get full number in chars) - if pos == 'TW' and token.isdecimal(): - lemma = token - - # some few tokens have weird whitespace: "' '" - token = token.replace(' ', '_') - - # some few cases have no lemma, use token instead - lemma = lemma or UNRESOLVED - if ' ' in lemma: - # complex cases (mostly proper names and lemmas with question marks) - lemma = UNRESOLVED - # some few cases have no pos, use UNK instead - pos = pos or UNRESOLVED - yield '{}\t{}\t{}\n'.format(token, lemma, pos) - - -five = int(len(train) * 0.05) -for split, files in {'dev': train[:five], 'train': train[five:]}.items(): - with open('{}/{}.tsv'.format(root, split), 'w') as f: - f.write('token\tlemma\tpos\n') - for inf in files: - # print("***") - # print(inf) - for line in readlines(os.path.join(root, 'train', inf)): - f.write(line) - - -# process test files -with open('{}/test.tsv'.format(root), 'w') as f: - f.write('token\tlemma\tpos\n') - for inf in os.listdir(os.path.join(root, 'test')): - for line in readlines(os.path.join(root, 'test', inf)): - f.write(line) diff --git a/datasets/historical/process_geste.py b/datasets/historical/process_geste.py deleted file mode 100644 index 68973f0..0000000 --- a/datasets/historical/process_geste.py +++ /dev/null @@ -1,58 +0,0 @@ - -# Warning: make sure to use commit: 9c1fd27 from https://github.com/Jean-Baptiste-Camps/Geste -import os -import random -random.seed(1001) - -root = './geste/' - -files = [f for f in os.listdir(root) if f.endswith('csv')] -random.shuffle(files) - - -def subsets(nsents=100): - subsets = [] - for f in files: - subset = [] - with open(os.path.join(root, f)) as f: - sent = [] - for line in f: - if len(subset) == nsents: - subsets.append(subset) - subset = [] - if not line.strip(): - sent.append("\n") - subset.append(sent) - sent = [] - else: - sent.append(line) - - if subset: - subsets.append(subset) - - return subsets - - -subsets = subsets() -five, ten = round(len(subsets) * 0.05), round(len(subsets) * 0.1) -random.shuffle(subsets) - - -def writesubsets(path, subsets): - with open(path, 'w') as f: - f.write("token\tlemma\tpos\tmorph\n") - for subset in subsets: - for sent in subset: - for line in sent: - f.write(line) - - -target = 'corpus' -if not os.path.isdir(os.path.join(root, target)): - os.makedirs(os.path.join(root, target)) -for split, subsets in { - 'dev': subsets[:five], - 'test': subsets[five:five+ten], - 'train': subsets[five+ten:] -}.items(): - writesubsets(os.path.join(root, target, '{}.tsv'.format(split)), subsets) diff --git a/datasets/historical/process_goo300k.py b/datasets/historical/process_goo300k.py deleted file mode 100644 index 9870620..0000000 --- 
a/datasets/historical/process_goo300k.py +++ /dev/null @@ -1,43 +0,0 @@ - -import os -from lxml import etree -import random -random.seed(1001) - -TEI = 'http://www.tei-c.org/ns/1.0' - -root = 'goo300k' -if not os.path.isdir(root): - os.makedirs(root) -path = './Reference corpus of historical Slovene goo300k 1.2/goo300k-vert/goo300k.vert' -with open(path) as f: - tree = etree.fromstring(f.read().replace('<g/>', '')).getroottree() - -sents = tree.findall('//tei:s', namespaces={'tei': TEI}) - - -def process_sent(sent): - for line in sent.text.strip().split('\n'): - line = line.strip() - if not line: - continue - token, _, lemma, pos, *_ = line.split('\t') - yield token, lemma, pos - - -random.shuffle(sents) -five = int(len(sents) * 0.05) - -for split, sents in {'dev': sents[:five], - 'test': sents[five:(five*2)+five], - 'train': sents[(five*2)+five:]}.items(): - with open(os.path.join(root, '{}.tsv'.format(split)), 'w') as f: - f.write('{}\t{}\t{}\n'.format("token", "lemma", "pos")) - for sent in sents: - line = list(process_sent(sent)) - if len(line) == 0: - print("Empty line") - continue - for token, lemma, pos in line: - f.write('{}\t{}\t{}\n'.format(token, lemma, pos)) - f.write('\n') diff --git a/datasets/historical/process_llct.py b/datasets/historical/process_llct.py deleted file mode 100644 index 946df34..0000000 --- a/datasets/historical/process_llct.py +++ /dev/null @@ -1,67 +0,0 @@ - -import os -from lxml.etree import XMLParser, parse -import random -random.seed(1001) - -ALDT = 'http://ufal.mff.cuni.cz/pdt/pml/' - -parser = XMLParser(huge_tree=True) -target = 'LLCT1' -root = './LLCT1.xml' -roottree = parse(root, parser=parser) - -OMIT = set('id relation seg status form lemma pos'.split()) - - -def parsetree(tree): - sent = [] - for node in sorted(tree.iterdescendants(), key=lambda node: int(node.attrib['id'])): - token, lemma = node.attrib['form'], node.attrib['lemma'] - if 'pos' not in node.attrib: - pos = 'punc' - else: - pos = node.attrib['pos'] - if ' ' in token: # some tokens are like ".... ...." - token = '...'
- assert token - assert lemma - assert pos - assert " " not in token - assert " " not in lemma - assert " " not in pos - morph = '|'.join('{}={}'.format(k, v) - for k, v in sorted(node.attrib.items()) if k not in OMIT) - sent.append((token, lemma, pos, morph)) - return sent - - -def parsetrees(roottree): - for tree in roottree.xpath( - '//aldt:LM[not(ancestor::aldt:LM)]', namespaces={'aldt': ALDT}): - yield parsetree(tree) - - -if __name__ == '__main__': - # trees = roottree.xpath('//aldt:LM[not(ancestor::aldt:LM)]', namespaces={'aldt': ALDT}) - # import collections - # counts = {k: collections.Counter() - # for k in 'document_id subdoc date place scribe type'.split()} - # for tree in trees: - # for k in counts: - # counts[k][tree.attrib[k]] += 1 - - trees = list(parsetrees(roottree)) - random.shuffle(trees) - five = int(len(trees) * 0.05) - if not os.path.isdir(target): - os.makedirs(target) - for split, sents in {'dev': trees[:five], - 'test': trees[five:3*five], - 'train': trees[3*five:]}.items(): - with open(os.path.join(target, '{}.tsv'.format(split)), 'w+') as f: - f.write('token\tlemma\tpos\tmorph\n') - for sent in sents: - for token, lemma, pos, morph in sent: - f.write('{}\t{}\t{}\t{}\n'.format(token, lemma, pos, morph)) - f.write('\n') diff --git a/datasets/historical/process_rem.py b/datasets/historical/process_rem.py deleted file mode 100644 index f1047fe..0000000 --- a/datasets/historical/process_rem.py +++ /dev/null @@ -1,85 +0,0 @@ - -import shutil -import os -from lxml import etree -import random -random.seed(100101) - - -root = 'rem-corralled-20161222' -target = 'rem' - - -def readlines(path): - with open(path, 'rb') as f: - tree = etree.fromstring(f.read()).getroottree() - for token in tree.findall('token'): - # there are 8 cases were tok_dipl is empty (resort to trans) - form = token.find('tok_dipl').attrib['utf'] or token.attrib['trans'] - anno = token.find('tok_anno') - pos = anno.find('pos').attrib['tag'] - if token.attrib['type'] == 'punc': - # punctuation - lemma = form - else: - lemma = anno.find('lemma').attrib['tag'] - # there is 1 case where lemma has whitespace: "nâh sup" - if len(lemma.split()) > 1: - print("Substituting complex lemma:", lemma) - lemma = lemma.split()[0] - - yield form, lemma, pos - - -def make_subcorpus(subcorpus, **criteria): - files = [] - for f in os.listdir(root): - if not f.endswith('xml'): - continue - - with open(os.path.join(root, f), 'rb') as inf: - header = etree.fromstring(inf.read()).getroottree().find('header') - - is_in = True - for crit, val in criteria.items(): - if header.find(crit).text != val: - is_in = False - if is_in: - files.append(f) - - random.shuffle(files) - five, ten = int(len(files) * 0.05), int(len(files) * 0.1) - files = { - 'test': files[:ten], - 'dev': files[ten: ten+five], - 'train': files[ten+five:] - } - for split, files in files.items(): - subcorpuspath = os.path.join(target, subcorpus, 'splits', split) - if not os.path.isdir(subcorpuspath): - os.makedirs(subcorpuspath) - - with open(os.path.join(target, subcorpus, split + ".tsv"), 'w') as outf: - outf.write('token\tlemma\tpos\n') - for f in files: - shutil.copy( - os.path.join(root, f), - os.path.join(target, subcorpus, 'splits', split, f)) - for form, lemma, pos in readlines(os.path.join(root, f)): - outf.write("{}\t{}\t{}\n".format(form, lemma, pos)) - - -if __name__ == '__main__': - make_subcorpus("poesie", topic="Poesie") - make_subcorpus("recht", topic="Recht") - make_subcorpus("religion", topic="Religion") - -# count words per category -# counters = 
{k: Counter() for k in 'topic text-type genre language language-type language-region language-area time corpus'.split()} - -# for f in files: -# # nwords = len(list(readlines(os.path.join(root, f)))) -# with open(os.path.join(root, f), 'rb') as f: -# header = etree.fromstring(f.read()).getroottree().find('header') -# for k in counters: -# counters[k][header.find(k).text] += 1 diff --git a/datasets/historical/process_ren.py b/datasets/historical/process_ren.py deleted file mode 100644 index 2e2304c..0000000 --- a/datasets/historical/process_ren.py +++ /dev/null @@ -1,57 +0,0 @@ - -import os -from lxml import etree -import random -random.seed(100101) - -root = 'ReN_2018-07-23' -target = 'ren' -if not os.path.isdir(target): - os.makedirs(target) - - -def readlines(path): - with open(path, 'rb') as f: - tree = etree.fromstring(f.read()).getroottree() - for token in tree.findall('token'): - # there are 8 cases were tok_dipl is empty (resort to trans) - form = token.find('dipl').attrib['utf'] or token.attrib['trans'] - anno = token.find('anno') - pos = anno.find('pos').attrib['tag'] - lemma = anno.find('lemma').attrib['tag'] - morph = anno.find('morph').attrib['tag'] - # there are 3 cases where lemma has whitespace: "wager man" - if len(lemma.split()) > 1: - print("Substituting complex lemma:", lemma) - lemma = lemma.split()[0] - # there are 5 cases with empty lemma - if not lemma: - lemma = "" - - yield form, lemma, pos, morph - - -def writelines(f, lines): - for token, lemma, pos, morph in lines: - f.write("{}\t{}\t{}\t{}\n".format(token, lemma, pos, morph)) - - -files = os.listdir(root) -random.shuffle(files) -formatter = os.path.join(target, '{}.tsv').format -with open(formatter('train'), 'w+') as train, \ - open(formatter('test'), 'w+') as test, \ - open(formatter('dev'), 'w+') as dev: - train.write('token\tlemma\tpos\tmorph\n') - test.write('token\tlemma\tpos\tmorph\n') - dev.write('token\tlemma\tpos\tmorph\n') - for f in files: - lines = list(readlines(os.path.join(root, f))) - if len(lines) < 5000: - writelines(train, lines) - else: - five = int(len(lines) * 0.05) - ten = int(len(lines) * 0.1) - writelines(dev, lines[:ten]) - writelines(test, lines[ten: five + (2*ten)]) - writelines(train, lines[five + (2*ten):]) From 1f0cae8813029fa4c88d720eb9bc02c17c1f539a Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:08:45 +0200 Subject: [PATCH 06/46] custom lstm fix --- pie/models/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pie/models/lstm.py b/pie/models/lstm.py index ba3c9db..ab5bad1 100644 --- a/pie/models/lstm.py +++ b/pie/models/lstm.py @@ -59,11 +59,10 @@ def __repr__(self): class CustomLSTM(nn.Module): """A module that runs multiple steps of LSTM.""" - def __init__(self, input_size, hidden_size, num_layers=1, dropout=0, **kwargs): + def __init__(self, input_size, hidden_size, num_layers=1, **kwargs): super().__init__() self.input_size = input_size self.hidden_size = hidden_size - self.dropout = dropout self.cell = CustomLSTMCell(input_size=input_size, hidden_size=hidden_size) @staticmethod @@ -106,6 +105,7 @@ def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.0): self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers + self.dropout = dropout super().__init__() layers = [] From 663ef0defbd7da56f9f5a860fb2689bf5fd4691c Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:11:11 +0200 Subject: [PATCH 07/46] allow env nested keys and minor refactor --- 
pie/scripts/train.py | 17 +------- pie/settings.py | 97 ++++++++++++++++++++++---------------------- pie/utils.py | 19 +++++++++ 3 files changed, 69 insertions(+), 64 deletions(-) diff --git a/pie/scripts/train.py b/pie/scripts/train.py index 5570a90..84da13c 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -7,13 +7,13 @@ import pie +from pie import utils from pie.settings import settings_from_file from pie.trainer import Trainer from pie import initialization from pie.data import Dataset, Reader, MultiLabelEncoder from pie.models import SimpleModel, get_pretrained_embeddings -# set seeds import random import numpy import torch @@ -41,21 +41,6 @@ def run(config_path): if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - settings = settings_from_file(config_path) - - # check settings - # - check at least and at most one target - has_target = False - for task in settings.tasks: - if len(settings.tasks) == 1: - task['target'] = True - if task.get('target', False): - if has_target: - raise ValueError("Got more than one target task") - has_target = True - if not has_target: - raise ValueError("Needs at least one target task") - # datasets reader = Reader(settings, settings.input_path) tasks = reader.check_tasks(expected=None) diff --git a/pie/settings.py b/pie/settings.py index 4c67a7b..9fad412 100644 --- a/pie/settings.py +++ b/pie/settings.py @@ -4,6 +4,8 @@ import json from json_minify import json_minify +from pie import utils + DEFAULTPATH = os.sep.join([os.path.dirname(__file__), 'default_settings.json']) @@ -38,38 +40,6 @@ def __delitem__(self, key): del self.__dict__[key] -def flat_merge(s1, s2): - """ - Merge two dictionaries in a flat way (non-recursive) - - >>> flat_merge({"a": {"b": 1}}, {"a": {"c": 2}}) - {'a': {'b': 1}} - """ - for k in s2: - if k not in s1: - s1[k] = s2[k] - - return s1 - - -def recursive_merge(s1, s2): - """ - Recursively merge two dictionaries - - >>> recursive_merge({"a": {"b": 1}}, {"a": {"c": 2}}) - {'a': {'b': 1, 'c': 2}} - """ - for k, v in s2.items(): - if k in s1 and isinstance(v, dict): - if not isinstance(s1[k], dict): - raise ValueError("Expected dictionary at key [{}]".format(k)) - s1[k] = recursive_merge(s1[k], v) - elif k not in s1: - s1[k] = v - - return s1 - - def merge_task_defaults(settings): for task in settings.tasks: task_settings = task.get("settings", {}) @@ -89,6 +59,46 @@ def load_default_settings(): return merge_task_defaults(Settings(json.loads(json_minify(f.read())))) +def parse_key(keys, v, defaults): + """ + >>> parse_key(['a', 'b'], '1', {'a': {'b': 2}}) + {'a': {'b': 1}} + """ + key, *keys = keys + if key not in defaults: + raise ValueError("Unknown key: ", key) + if not keys: + return {key: type(defaults[key])(v)} + return {key: parse_key(keys, v, defaults[key])} + + +def parse_env_settings(defaults): + output = {} + for k, v in os.environ.items(): + if not k.startswith('PIE_'): + continue + keys = k.lower()[4:].split('__') + output = utils.recursive_merge(output, parse_key(keys, v, defaults)) + + return output + + +def check_settings(settings): + # - check at least and at most one target + has_target = False + for task in settings.tasks: + if len(settings.tasks) == 1: + task['target'] = True + if task.get('target', False): + if has_target: + raise ValueError("Got more than one target task") + has_target = True + if not has_target: + raise ValueError("Needs at least one target task") + + return settings + + def settings_from_file(config_path): """Loads and parses a parameter file. 
@@ -113,21 +123,12 @@ def settings_from_file(config_path): with open(DEFAULTPATH, 'r') as f: defaults = json.loads(json_minify(f.read())) - # settings = Settings(flat_merge(p, defaults)) - settings = Settings(recursive_merge(p, defaults)) - - # ultimately overwrite settings from environ vars of the form PIE_{var} - checked = [] - for k in settings: - env_k = 'PIE_{}'.format(k.upper()) - if env_k in os.environ: - # transform to target type and overwrite settings - settings[k] = type(defaults[k])(os.environ[env_k]) - checked.append(env_k) - for env_k in os.environ: - if env_k.startswith('PIE_') and env_k not in checked: - raise ValueError( - "Environment variable '{}' didn't match. Aborting!".format(env_k)) + settings = Settings( + utils.recursive_merge( + # merge defaults + utils.recursive_merge(p, defaults), + # ultimately overwrite settings from environ vars of the form PIE_{var} + parse_env_settings(defaults), overwrite=True)) # store the config path too: settings.config_path = config_path @@ -136,4 +137,4 @@ def settings_from_file(config_path): print("\n::: Loaded Config :::\n") print(yaml.dump(dict(settings))) - return merge_task_defaults(settings) + return check_settings(merge_task_defaults(settings)) diff --git a/pie/utils.py b/pie/utils.py index c362c8d..a7c9a9b 100644 --- a/pie/utils.py +++ b/pie/utils.py @@ -55,6 +55,25 @@ def flatten(it): yield from flatten(subit) +def recursive_merge(s1, s2, overwrite=False): + """ + Recursively merge two dictionaries + + >>> recursive_merge({"a": {"b": 1}}, {"a": {"c": 2}}) + {'a': {'b': 1, 'c': 2}} + """ + for k, v in s2.items(): + if k in s1 and isinstance(v, dict): + if not isinstance(s1[k], dict): + raise ValueError("Expected dictionary at key [{}]".format(k)) + s1[k] = recursive_merge(s1[k], v, overwrite=overwrite) + else: + if overwrite or k not in s1: + s1[k] = v + + return s1 + + def ensure_ext(path, ext, infix=None): """ Compute target path with eventual infix and extension From 63ca969f898dacc401488eb0b65b8ce7c9f2c0fd Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:13:52 +0200 Subject: [PATCH 08/46] moved lr scheduler to use pytorch builtin --- pie/default_settings.json | 3 +- pie/trainer.py | 70 +++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index 908c0f2..ca5dac6 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -116,7 +116,8 @@ "lr": 0.001, "lr_factor": 0.75, // lr schedule (decrease lr by this factor after `lr_patience` epochs // without improvement on dev-set data) - "lr_patience": 2, // patience for early stopping + "min_lr": 0.000001, // minimum learning rate + "lr_patience": 2, // patience for lr schedule "checks_per_epoch": 1, // check model on dev-set so many times during epoch // * Model hyperparameters diff --git a/pie/trainer.py b/pie/trainer.py index 68648e4..068ae3f 100644 --- a/pie/trainer.py +++ b/pie/trainer.py @@ -32,6 +32,13 @@ def sample_task(target, tasks, factor=2): return random.choices(tasks, weights)[0] +def get_target_task(settings): + for task in settings.tasks: + if task.get('target'): + return task['name'] + raise ValueError("No target task?") + + class EarlyStopException(Exception): def __init__(self, task, loss, state_dict): self.task = task @@ -43,8 +50,7 @@ class TaskScheduler(object): """ Track scores """ - def __init__(self, tasks, patience, factor, threshold, min_weight, - optimizer=None, lr_factor=1, lr_patience=100): + def __init__(self, 
tasks, patience, factor, threshold, min_weight): for task, values in tasks.items(): tasks[task] = {'steps': 0, **values} # set task mode @@ -59,12 +65,6 @@ def __init__(self, tasks, patience, factor, threshold, min_weight, else: tasks[task]['best'] = float('inf') - # lr schedule - self.optimizer = optimizer - self.lr_factor = lr_factor - self.lr_patience = lr_patience - self.lr_steps = 0 - # task schedule self.tasks = tasks self.patience = patience @@ -73,13 +73,6 @@ def __init__(self, tasks, patience, factor, threshold, min_weight, self.min_weight = min_weight self.fid = '/tmp/{}'.format(str(uuid.uuid1())) - def get_lr(self): - # assumes single param group - return float(self.optimizer.param_groups[0]['lr']) - - def set_lr(self, new_lr): - self.optimizer.param_groups[0]['lr'] = new_lr - def __repr__(self): # task scheduler output = ( @@ -93,12 +86,6 @@ def __repr__(self): output += '/>' output += '\n' - # lr scheduler - if self.optimizer is not None: - output += '\n' - output += '<LRScheduler lr="{}" steps="{}" patience="{}"/>'.format( - round(self.get_lr(), 5), self.lr_steps, self.lr_patience) - return output def is_best(self, task, value): @@ -129,13 +116,8 @@ def step(self, scores, model): if is_target: # serialize model params torch.save(model.state_dict(), self.fid) - # lr schedule - self.lr_steps = 0 else: self.tasks[task]['steps'] += 1 - # lr schedule - if is_target: - self.lr_steps += 1 # check if we need to stop globally or downweight a task loss patience = self.tasks[task].get('patience', self.patience) @@ -152,14 +134,25 @@ def step(self, scores, model): min_weight = self.tasks[task].get('min_weight', self.min_weight) self.tasks[task]['weight'] = max(new_weight, min_weight) - # lr schedule - if is_target and self.lr_steps >= self.lr_patience: - self.set_lr(self.get_lr() * self.lr_factor) - def get_weights(self): return {task: self.tasks[task]['weight'] for task in self.tasks} +class LRScheduler(object): + def __init__(self, optimizer, **kwargs): + self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='max', **kwargs) + + def step(self, score): + self.lr_scheduler.step(score) + + def __repr__(self): + return '<LRScheduler lr="{}" steps="{}" patience="{}"/>' \ + .format(self.lr_scheduler.optimizer.param_groups[0]['lr'], + self.lr_scheduler.num_bad_epochs, + self.lr_scheduler.patience) + + class Trainer(object): """ Trainer @@ -167,7 +160,7 @@ class Trainer(object): Settings ======== optim - lr + lr (lr_factor, lr_patience, min_lr) clip_norm weights report_freq @@ -175,6 +168,7 @@ class Trainer(object): """ def __init__(self, settings, model, dataset, num_instances): self.tasks = settings.tasks + self.target_task = get_target_task(settings) self.verbose = settings.verbose self.dataset = dataset self.model = model @@ -201,12 +195,10 @@ def __init__(self, settings, model, dataset, num_instances): tasks['lm_fwd'] = settings.lm_schedule tasks['lm_bwd'] = settings.lm_schedule self.task_scheduler = TaskScheduler( - # task schedule tasks, settings.patience, settings.factor, settings.threshold, - settings.min_weight, - # lr schedule - optimizer=self.optimizer, - lr_factor=settings.lr_factor, lr_patience=settings.lr_patience) + settings.min_weight) + self.lr_scheduler = LRScheduler(self.optimizer, factor=settings.lr_factor, + patience=settings.lr_patience, min_lr=settings.min_lr) if settings.verbose: print() @@ -217,6 +209,9 @@ def __init__(self, settings, model, dataset, num_instances): print() print(self.task_scheduler) print() + print("::: LR schedule :::") + print(self.lr_scheduler) + print() def weight_loss(self, loss): """ @@ -276,10 +271,13 @@ def 
run_check(self, devset): dev_scores['lm_bwd'] = dev_loss['lm_bwd'] self.task_scheduler.step(dev_scores, self.model) + self.lr_scheduler.step(dev_scores[self.target_task]) if self.verbose: print(self.task_scheduler) print() + print(self.lr_scheduler) + print() return dev_scores From 7dfb65c89ae7f1275997712edf70c4710a35cc59 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:15:19 +0200 Subject: [PATCH 09/46] added optimize script for random hyperparam search --- pie/scripts/optimize.py | 121 ++++++++++++++++++++++++++++++++++++++++ pie/scripts/train.py | 5 +- 2 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 pie/scripts/optimize.py diff --git a/pie/scripts/optimize.py b/pie/scripts/optimize.py new file mode 100644 index 0000000..310084d --- /dev/null +++ b/pie/scripts/optimize.py @@ -0,0 +1,121 @@ + +import random +import json + +import yaml +from json_minify import json_minify +import scipy.stats as stats + +from pie import utils +from pie.settings import settings_from_file, Settings + + +# available distributions +class truncnorm: + def __init__(self, mu, std, lower=0, upper=1): + a, b = (lower - mu) / std, (upper - mu) / std + self.norm = stats.truncnorm(a, b, mu, std) + + def rvs(self): + return float(self.norm.rvs()) + + +class normint: + def __init__(self, mu, std, lower, upper): + self.norm = truncnorm(mu, std, lower, upper) + + def rvs(self): + return int(round(self.norm.rvs())) // 2 * 2 + + +class choice: + def __init__(self, items): + self.items = items + + def rvs(self): + return random.choice(self.items) + + +def parse_opt(obj, opt_key): + """ + Parses the opt file into a (possibly deep) dictionary where the leaves are + ready-to-use distributions + """ + opt = {} + + for param, v in obj.items(): + if isinstance(v, list): + opt[param] = [parse_opt(v_item, opt_key) for v_item in v] + elif isinstance(v, dict): + if opt_key in v: + if v[opt_key] == 'norm': + opt[param] = stats.norm(**v['params']) + elif v[opt_key] == 'truncnorm': + opt[param] = truncnorm(**v['params']) + elif v[opt_key] == 'normint': + opt[param] = normint(**v['params']) + elif v[opt_key] == 'choice': + opt[param] = choice(v['params']) + else: + raise ValueError("Unknown distribution: ", v[opt_key]) + else: + opt[param] = parse_opt(v, opt_key) + + return opt + + +def read_opt(path, opt_key='opt'): + """ + Reads and parses the opt file (as per parse_opt) + """ + with open(path) as f: + obj = json.loads(json_minify(f.read())) + + return parse_opt(obj, opt_key) + + +def sample_from_config(opt): + """ + Applies the distributions specified in the opt.json file + """ + output = {} + + for param, dist in opt.items(): + if isinstance(dist, dict): + output[param] = sample_from_config(dist) + elif isinstance(dist, list): + output[param] = [sample_from_config(d) for d in dist] + else: + output[param] = dist.rvs() + + return output + + +def run(config, opt, n_iter): + import train + + for i in range(n_iter): + print() + print("::: Starting optimization run {} :::".format(i + 1)) + print() + sampled_config = sample_from_config(opt) + merged = utils.recursive_merge(dict(config), sampled_config, overwrite=True) + print(yaml.dump(dict(config))) + print(yaml.dump(merged)) + train.run(Settings(merged)) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('config_path', default='config.json') + parser.add_argument('opt_path') + parser.add_argument('--n_iter', type=int, default=20) + args = parser.parse_args() + + with utils.shutup(): + 
config = settings_from_file(args.config_path) + + opt = read_opt(args.opt_path) + + run(config, opt, args.n_iter) diff --git a/pie/scripts/train.py b/pie/scripts/train.py index 84da13c..e6d6f23 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -31,7 +31,8 @@ def get_fname_infix(settings): return fname, infix -def run(config_path): +def run(settings): + # seeding now = datetime.now() seed = now.hour * 10000 + now.minute * 100 + now.second print("Using seed:", seed) @@ -187,4 +188,4 @@ def run(config_path): parser = argparse.ArgumentParser() parser.add_argument('config_path', nargs='?', default='config.json') args = parser.parse_args() - run(config_path=args.config_path) + run(settings_from_file(args.config_path)) From 4d3d1dd8c32fa76a7151f9efca9d91f5d524b49c Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:15:46 +0200 Subject: [PATCH 10/46] minor refactor --- pie/default_settings.json | 4 ++-- pie/scripts/train.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index ca5dac6..09a71fe 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -82,7 +82,7 @@ }, // general task schedule params (can be overwritten in the "settings" entry of each) - "patience": 1000000, // default task schedule patience + "patience": 100, // task patience (global early stopping patience for target task) "factor": 1, // default task schedule factor "threshold": 0, // default task schedule thresholed "min_weight": 0, // default task schedule min_weight @@ -92,7 +92,7 @@ "lm_shared_softmax": false, // whether to share the output layer for both fwd and bwd lm "lm_schedule": { // settings for joint LM task in case `include_lm` is true - "patience": 100, "factor": 0.5, "weight": 0.2, "mode": "min" + "patience": 2, "factor": 0.5, "weight": 0.2, "mode": "min" }, // * Training diff --git a/pie/scripts/train.py b/pie/scripts/train.py index e6d6f23..a577958 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -171,8 +171,10 @@ def run(settings): scorer = scorers[task] result = scorer.get_scores() for acc in result: - scores.append('{}:{:.6f}'.format(task, result[acc]['accuracy'])) - scores.append('{}-support:{}'.format(task, result[acc]['support'])) + scores.append('{}-{}:{:.6f}'.format( + acc, task, result[acc]['accuracy'])) + scores.append('{}-{}-support:{}'.format( + acc, task, result[acc]['support'])) path = '{}.results.{}.csv'.format( settings.modelname, '-'.join(get_targets(settings))) with open(path, 'a') as f: From d2a0ff85198e867d82f409dc77e5191ba7c3002f Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 10:15:56 +0200 Subject: [PATCH 11/46] added inspect model script --- pie/scripts/inspect_model.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 pie/scripts/inspect_model.py diff --git a/pie/scripts/inspect_model.py b/pie/scripts/inspect_model.py new file mode 100644 index 0000000..6c95941 --- /dev/null +++ b/pie/scripts/inspect_model.py @@ -0,0 +1,17 @@ + +import pie +import yaml + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('modelpath') + args = parser.parse_args() + + m = pie.SimpleModel.load(args.modelpath) + print("::: Settings :::") + print(yaml.dump(dict(m._settings))) + print() + print("::: Model :::") + print(m) + From 71d115a97dba8b4100af7029236334b3055f5aaf Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 13:18:43 +0200 Subject: 
[PATCH 12/46] serializing without tmp files --- pie/default_settings.json | 2 +- pie/models/base_model.py | 11 ++++------- pie/models/model.py | 16 ++++++++++++---- pie/utils.py | 11 +++++++++++ 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index 09a71fe..f43b651 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -89,7 +89,7 @@ // * Joint LM-loss "include_lm": false, // whether to include autoregressive loss - "lm_shared_softmax": false, // whether to share the output layer for both fwd and bwd lm + "lm_shared_softmax": true, // whether to share the output layer for both fwd and bwd lm "lm_schedule": { // settings for joint LM task in case `include_lm` is true "patience": 2, "factor": 0.5, "weight": 0.2, "mode": "min" diff --git a/pie/models/base_model.py b/pie/models/base_model.py index 48270a1..ff20abd 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -123,9 +123,7 @@ def save(self, fpath, infix=None, settings=None): utils.add_gzip_to_tar(string, path, tar) # serialize weights - with utils.tmpfile() as tmppath: - torch.save(self.state_dict(), tmppath) - tar.add(tmppath, arcname='state_dict.pt') + utils.add_weights_to_tar(self.state_dict(), 'state_dict.pt', tar) # serialize current pie commit if pie.__commit__ is not None: @@ -191,10 +189,9 @@ def load(fpath): logging.warn("Couldn't load settings for model {}!".format(fpath)) # load state_dict - with utils.tmpfile() as tmppath: - tar.extract('state_dict.pt', path=tmppath) - dictpath = os.path.join(tmppath, 'state_dict.pt') - model.load_state_dict(torch.load(dictpath, map_location='cpu')) + model.load_state_dict( + torch.load(tar.extractfile('state_dict.pt'), + map_location='cpu')) model.eval() diff --git a/pie/models/model.py b/pie/models/model.py index 8833666..120f6ac 100644 --- a/pie/models/model.py +++ b/pie/models/model.py @@ -39,10 +39,18 @@ class SimpleModel(BaseModel): cemb_type : str, one of "RNN", "CNN", layer to use for char-level embeddings """ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_layers, - dropout=0.0, word_dropout=0.0, merge_type='concat', cemb_type='RNN', - cemb_layers=1, cell='LSTM', custom_cemb_cell=False, scorer='general', - include_lm=True, lm_shared_softmax=True, init_rnn='xavier_uniform', - linear_layers=1, **kwargs): + cell='LSTM', init_rnn='xavier_uniform', + # dropout + dropout=0.0, word_dropout=0.0, + # word embeddings + merge_type='concat', cemb_type='RNN', cemb_layers=1, + custom_cemb_cell=False, scorer='general', + # lm joint loss + include_lm=True, lm_shared_softmax=True, + # decoder + linear_layers=1, + # kwargs + **kwargs): # args self.wemb_dim = wemb_dim self.cemb_dim = cemb_dim diff --git a/pie/utils.py b/pie/utils.py index a7c9a9b..a4af0d9 100644 --- a/pie/utils.py +++ b/pie/utils.py @@ -1,9 +1,11 @@ import re +import io import os import shutil import uuid import gzip +import tarfile import logging import sys import glob @@ -144,6 +146,15 @@ def tmpfile(parent='/tmp/'): os.remove(tmppath) +def add_weights_to_tar(state_dict, path, tar): + f = io.BytesIO() + torch.save(state_dict, f) + tinf = tarfile.TarInfo(name=path) + f.seek(0) + tinf.size = len(f.getbuffer()) + tar.addfile(tinf, f) # read tinf.size bytes from f into tinf + + def add_gzip_to_tar(string, subpath, tar): with tmpfile() as tmppath: with gzip.GzipFile(tmppath, 'w') as f: From 0c134c27166bf11880121ca91bfa8fa4b09def8d Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 
13:18:43 +0200 Subject: [PATCH 13/46] serializing without tmp files --- pie/default_settings.json | 2 +- pie/models/base_model.py | 11 ++++------- pie/models/model.py | 16 ++++++++++++---- pie/utils.py | 12 ++++++++++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index 09a71fe..f43b651 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -89,7 +89,7 @@ // * Joint LM-loss "include_lm": false, // whether to include autoregressive loss - "lm_shared_softmax": false, // whether to share the output layer for both fwd and bwd lm + "lm_shared_softmax": true, // whether to share the output layer for both fwd and bwd lm "lm_schedule": { // settings for joint LM task in case `include_lm` is true "patience": 2, "factor": 0.5, "weight": 0.2, "mode": "min" diff --git a/pie/models/base_model.py b/pie/models/base_model.py index 48270a1..ff20abd 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -123,9 +123,7 @@ def save(self, fpath, infix=None, settings=None): utils.add_gzip_to_tar(string, path, tar) # serialize weights - with utils.tmpfile() as tmppath: - torch.save(self.state_dict(), tmppath) - tar.add(tmppath, arcname='state_dict.pt') + utils.add_weights_to_tar(self.state_dict(), 'state_dict.pt', tar) # serialize current pie commit if pie.__commit__ is not None: @@ -191,10 +189,9 @@ def load(fpath): logging.warn("Couldn't load settings for model {}!".format(fpath)) # load state_dict - with utils.tmpfile() as tmppath: - tar.extract('state_dict.pt', path=tmppath) - dictpath = os.path.join(tmppath, 'state_dict.pt') - model.load_state_dict(torch.load(dictpath, map_location='cpu')) + model.load_state_dict( + torch.load(tar.extractfile('state_dict.pt'), + map_location='cpu')) model.eval() diff --git a/pie/models/model.py b/pie/models/model.py index 8833666..120f6ac 100644 --- a/pie/models/model.py +++ b/pie/models/model.py @@ -39,10 +39,18 @@ class SimpleModel(BaseModel): cemb_type : str, one of "RNN", "CNN", layer to use for char-level embeddings """ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_layers, - dropout=0.0, word_dropout=0.0, merge_type='concat', cemb_type='RNN', - cemb_layers=1, cell='LSTM', custom_cemb_cell=False, scorer='general', - include_lm=True, lm_shared_softmax=True, init_rnn='xavier_uniform', - linear_layers=1, **kwargs): + cell='LSTM', init_rnn='xavier_uniform', + # dropout + dropout=0.0, word_dropout=0.0, + # word embeddings + merge_type='concat', cemb_type='RNN', cemb_layers=1, + custom_cemb_cell=False, scorer='general', + # lm joint loss + include_lm=True, lm_shared_softmax=True, + # decoder + linear_layers=1, + # kwargs + **kwargs): # args self.wemb_dim = wemb_dim self.cemb_dim = cemb_dim diff --git a/pie/utils.py b/pie/utils.py index a7c9a9b..0f7f91a 100644 --- a/pie/utils.py +++ b/pie/utils.py @@ -1,9 +1,11 @@ import re +import io import os import shutil import uuid import gzip +import tarfile import logging import sys import glob @@ -144,6 +146,16 @@ def tmpfile(parent='/tmp/'): os.remove(tmppath) +def add_weights_to_tar(state_dict, path, tar): + import torch + f = io.BytesIO() + torch.save(state_dict, f) + tinf = tarfile.TarInfo(name=path) + f.seek(0) + tinf.size = len(f.getbuffer()) + tar.addfile(tinf, f) # read tinf.size bytes from f into tinf + + def add_gzip_to_tar(string, subpath, tar): with tmpfile() as tmppath: with gzip.GzipFile(tmppath, 'w') as f: From c7fe11163e71c8bb5aedc5e18e9ea5bdfb9d69bc Mon Sep 17 00:00:00 2001 From: 
Enrique Manjavacas Date: Wed, 8 May 2019 15:16:31 +0200 Subject: [PATCH 14/46] refactor scheduler init --- pie/trainer.py | 62 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/pie/trainer.py b/pie/trainer.py index 068ae3f..bc7a219 100644 --- a/pie/trainer.py +++ b/pie/trainer.py @@ -50,27 +50,38 @@ class TaskScheduler(object): """ Track scores """ - def __init__(self, tasks, patience, factor, threshold, min_weight): - for task, values in tasks.items(): - tasks[task] = {'steps': 0, **values} - # set task mode - if 'mode' not in tasks[task]: - tasks[task]['mode'] = 'max' + def __init__(self, settings): + tasks = {} + # preprocess tasks + for task in settings.tasks: + # ignore read-only + if task.get('read_only'): + continue + # add schedule and target + tasks[task['name']] = task.get('schedule', {}) + tasks[task['name']]['target'] = task.get('target', False) + # add task data for lm loss + if settings.include_lm: + tasks['lm_fwd'] = settings.lm_schedule + tasks['lm_bwd'] = settings.lm_schedule + + for task, tdata in tasks.items(): + # set step counter + tdata['step'] = 0 + # set default task mode + tdata['mode'] = tdata.get('mode', 'max') # set initial weight - if 'weight' not in tasks[task]: - tasks[task]['weight'] = 1.0 + tdata['weight'] = tdata.get('weight', 1.0) # set initial best - if tasks[task]['mode'] == 'max': - tasks[task]['best'] = -float('inf') - else: - tasks[task]['best'] = float('inf') + tdata['best'] = -float('inf') if tdata['mode'] == 'max' else float('inf') - # task schedule self.tasks = tasks - self.patience = patience - self.factor = factor - self.threshold = threshold - self.min_weight = min_weight + + # task schedule + self.patience = settings.patience + self.factor = settings.factor + self.threshold = settings.threshold + self.min_weight = settings.min_weight self.fid = '/tmp/{}'.format(str(uuid.uuid1())) def __repr__(self): @@ -167,7 +178,6 @@ class Trainer(object): checks_per_epoch """ def __init__(self, settings, model, dataset, num_instances): - self.tasks = settings.tasks self.target_task = get_target_task(settings) self.verbose = settings.verbose self.dataset = dataset @@ -187,17 +197,9 @@ def __init__(self, settings, model, dataset, num_instances): else: self.check_freq = 0 # no checks - tasks = {} - for task in settings.tasks: - tasks[task['name']] = task.get('schedule', {}) - tasks[task['name']]['target'] = task.get('target', False) - if settings.include_lm: - tasks['lm_fwd'] = settings.lm_schedule - tasks['lm_bwd'] = settings.lm_schedule - self.task_scheduler = TaskScheduler( - tasks, settings.patience, settings.factor, settings.threshold, - settings.min_weight) - self.lr_scheduler = LRScheduler(self.optimizer, factor=settings.lr_factor, + self.task_scheduler = TaskScheduler(settings) + self.lr_scheduler = LRScheduler( + self.optimizer, factor=settings.lr_factor, patience=settings.lr_patience, min_lr=settings.min_lr) if settings.verbose: @@ -289,7 +291,7 @@ def train_epoch(self, devset, epoch): for b, batch in enumerate(self.dataset.batch_generator()): # get loss - loss = self.model.loss(batch, get_batch_task(self.tasks)) + loss = self.model.loss(batch, get_batch_task(self.model.tasks.values())) if not loss: raise ValueError("Got empty loss, no tasks defined?") From eb6e6f68771bba08c34efcef9637f7d1ef9f8590 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 8 May 2019 15:17:17 +0200 Subject: [PATCH 15/46] read-only tasks --- pie/default_settings.json | 8 ++++++-- 
pie/models/base_model.py | 7 +++---- pie/models/model.py | 2 +- pie/settings.py | 2 ++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index f43b651..a13be5e 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -69,16 +69,20 @@ "patience": 2, "threshold": 0.001 }, - "default": "copy" // while processing the files if the field is missing predict + "default": "copy", // while processing the files if the field is missing predict // the input token or something else: // - "copy" for copy over the token form // - "UNK" predict "UNK" + "read_only": false // encode task but don't model it } ], // task defaults for any given auxiliary task (can be overwritten by a task definition) "task_defaults": { - "level": "token", "layer": -1, "decoder": "linear", "context": "sentence" + "level": "token", + "layer": -1, + "decoder": "linear", + "context": "sentence" }, // general task schedule params (can be overwritten in the "settings" entry of each) diff --git a/pie/models/base_model.py b/pie/models/base_model.py index ff20abd..ea37b04 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -27,9 +27,7 @@ class BaseModel(nn.Module): def __init__(self, label_encoder, tasks, *args, **kwargs): self.label_encoder = label_encoder # prepare input task data from task settings - if isinstance(tasks, list): - tasks = {task['name']: task for task in tasks} - self.tasks = tasks + self.tasks = {task['name']: task for task in tasks if not task.get('read_only')} super().__init__() def loss(self, batch_data): @@ -62,7 +60,8 @@ def evaluate(self, dataset, trainset=None, **kwargs): assert not self.training, "Ooops! Inference in training mode. Call model.eval()" scorers = {} - for task, le in self.label_encoder.tasks.items(): + for task in self.tasks: + le = self.label_encoder.tasks[task] scorers[task] = Scorer(le, trainset) with torch.no_grad(): diff --git a/pie/models/model.py b/pie/models/model.py index 120f6ac..0adff37 100644 --- a/pie/models/model.py +++ b/pie/models/model.py @@ -309,7 +309,7 @@ def loss(self, batch_data, *target_tasks): return output def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): - tasks = set(self.label_encoder.tasks if not len(tasks) else tasks) + tasks = set(self.tasks if not len(tasks) else tasks) preds = {} (word, wlen), (char, clen) = inp diff --git a/pie/settings.py b/pie/settings.py index 9fad412..9c44921 100644 --- a/pie/settings.py +++ b/pie/settings.py @@ -90,6 +90,8 @@ def check_settings(settings): if len(settings.tasks) == 1: task['target'] = True if task.get('target', False): + if task.get('read_only'): + raise ValueError("Target task cannot be 'read_only'") if has_target: raise ValueError("Got more than one target task") has_target = True From 94a45d2ab74b86a7a405127a332fddf37ea56bdd Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 9 May 2019 17:07:43 +0200 Subject: [PATCH 16/46] fixes and refactors --- pie/data/dataset.py | 5 ++--- pie/models/base_model.py | 8 +++++--- pie/models/decoder.py | 43 ++++++++++++++++++++-------------------- pie/models/embedding.py | 26 +----------------------- pie/models/highway.py | 39 ++++++++++++++++++++++++++++++++++++ pie/scripts/evaluate.py | 2 +- pie/scripts/optimize.py | 21 ++++++++++++-------- pie/scripts/train.py | 6 +++++- pie/trainer.py | 2 +- 9 files changed, 89 insertions(+), 63 deletions(-) create mode 100644 pie/models/highway.py diff --git a/pie/data/dataset.py b/pie/data/dataset.py index 
b2ce1f0..800167b 100644 --- a/pie/data/dataset.py +++ b/pie/data/dataset.py @@ -303,9 +303,8 @@ def from_settings(cls, settings, tasks=None): for task in settings.tasks: if tasks is not None and task['settings']['target'] not in tasks: - logging.warning( - "Ignoring task [{}]: no available data".format(task['target'])) - continue + raise ValueError("No available data for task [{}]".format( + task['settings']['target'])) le.add_task(task['name'], level=task['level'], **task['settings']) return le diff --git a/pie/models/base_model.py b/pie/models/base_model.py index ea37b04..3f5f536 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -27,7 +27,10 @@ class BaseModel(nn.Module): def __init__(self, label_encoder, tasks, *args, **kwargs): self.label_encoder = label_encoder # prepare input task data from task settings - self.tasks = {task['name']: task for task in tasks if not task.get('read_only')} + if isinstance(tasks, list): + tasks = {task['name']: task for task in tasks} + # drop read-only tasks + self.tasks = {t: task for t, task in tasks.items() if not task.get('read_only')} super().__init__() def loss(self, batch_data): @@ -189,8 +192,7 @@ def load(fpath): # load state_dict model.load_state_dict( - torch.load(tar.extractfile('state_dict.pt'), - map_location='cpu')) + torch.load(tar.extractfile('state_dict.pt'), map_location='cpu')) model.eval() diff --git a/pie/models/decoder.py b/pie/models/decoder.py index fa0be77..3bd4e72 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -10,37 +10,38 @@ from .beam_search import Beam from .attention import Attention +from .highway import Highway -class Highway(nn.Module): +class ConditionEmbedding(nn.Module): """ - Highway network + Embed tags and project onto a fixed-size tag embedding """ - def __init__(self, in_features, num_layers, act='relu'): - self.in_features = in_features - - self.act = act + def __init__(self, label_encoders, emb_dim, out_features, dropout=0): + self.dropout = dropout super().__init__() - self.layers = nn.ModuleList( - [nn.Linear(in_features, in_features*2) for _ in range(num_layers)]) + self.embs = nn.ModuleDict({ + le.name: nn.Embedding(len(le), emb_dim, padding_idx=le.get_pad()) + for le in label_encoders}) + self.proj = nn.Linear(len(label_encoders) * emb_dim, out_features) self.init() def init(self): - for layer in self.layers: - initialization.init_linear(layer) - # bias gate to let information go untouched - nn.init.constant_(layer.bias[self.in_features:], 1.) 
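+        # init the per-task tag embeddings and the projection that merges them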
+ for emb in self.embs.values(): + initialization.init_embeddings(emb) + initialization.init_linear(self.proj) - def forward(self, inp): - current = inp - for layer in self.layers: - inp, gate = layer(current).chunk(2, dim=-1) - inp, gate = getattr(F, self.act)(inp), F.sigmoid(gate) - current = gate * current + (1 - gate) * inp + def forward(self, **conds): + """conds : dict mapping each task name to a tensor of tag ids, (seq_len x batch) or (batch)""" + embs = torch.cat( + [emb(conds[name]) for name, emb in sorted(self.embs.items())], + dim=-1) - return current + embs = F.dropout(embs, p=self.dropout, training=self.training) + + return self.proj(embs) class LinearDecoder(nn.Module): @@ -131,7 +132,7 @@ def init(self): nn.init.normal_(self.end_transition) def forward(self, enc_outs): - "get logits of the input features" + """get logits of the input features""" # (seq_len x batch x vocab) if self.highway is not None: enc_outs = self.highway(enc_outs) @@ -211,7 +212,7 @@ def loss(self, logits, targets, lengths): def predict(self, enc_outs, lengths): # (seq_len x batch x vocab) - logits = self.projection(enc_outs) + logits = self(enc_outs) seq_len, _, vocab = logits.size() start_tag, end_tag = vocab, vocab + 1 diff --git a/pie/models/embedding.py b/pie/models/embedding.py index d9978ac..d47891d 100644 --- a/pie/models/embedding.py +++ b/pie/models/embedding.py @@ -7,31 +7,7 @@ from pie import initialization from .lstm import CustomBiLSTM - - -class Highway(torch.nn.Module): - def __init__(self, input_dim, num_layers=1, activation=torch.nn.functional.relu): - super(Highway, self).__init__() - - self.layers = torch.nn.ModuleList( - [torch.nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]) - self.activation = activation - - for layer in self.layers: - layer.bias[input_dim:].data.fill_(1) - - def forward(self, inputs): - current_input = inputs - - for layer in self.layers: - projected_input = layer(current_input) - linear_part = current_input - nonlinear_part, gate = projected_input.chunk(2, dim=-1) - nonlinear_part = self.activation(nonlinear_part) - gate = torch.sigmoid(gate) - current_input = gate * linear_part + (1 - gate) * nonlinear_part - - return current_input +from .highway import Highway class CNNEmbedding(nn.Module): diff --git a/pie/models/highway.py b/pie/models/highway.py new file mode 100644 index 0000000..e74db14 --- /dev/null +++ b/pie/models/highway.py @@ -0,0 +1,39 @@ + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from pie import initialization + + +class Highway(nn.Module): + """ + Highway network + """ + def __init__(self, in_features, num_layers, act='relu'): + self.in_features = in_features + + self.act = act + super().__init__() + + self.layers = nn.ModuleList( + [nn.Linear(in_features, in_features*2) for _ in range(num_layers)]) + + self.init() + + def init(self): + for layer in self.layers: + initialization.init_linear(layer) + # bias gate to let information go untouched + nn.init.constant_(layer.bias[self.in_features:], 1.)
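+ # per layer, forward computes h = act(W_h x) and gate t = sigmoid(W_t x), + # returning t * x + (1 - t) * h; with the gate bias at 1, t starts high + # so the input initially passes through mostly unchanged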
+ + def forward(self, inp): + current = inp + for layer in self.layers: + inp, gate = layer(current).chunk(2, dim=-1) + inp, gate = getattr(F, self.act)(inp), F.sigmoid(gate) + current = gate * current + (1 - gate) * inp + + return current + + diff --git a/pie/scripts/evaluate.py b/pie/scripts/evaluate.py index e410f4c..586a86e 100644 --- a/pie/scripts/evaluate.py +++ b/pie/scripts/evaluate.py @@ -13,7 +13,6 @@ def run(model_path, test_path, train_path, model = BaseModel.load(model_path).to(device) if model_info: print(model) - if hasattr(model, '_settings'): # new models should all have _settings settings = model._settings elif settings: @@ -27,6 +26,7 @@ def run(model_path, test_path, train_path, settings.batch_size = batch_size settings.buffer_size = buffer_size settings.device = device + settings.shuffle = False # avoid shuffling trainset = None if train_path: diff --git a/pie/scripts/optimize.py b/pie/scripts/optimize.py index 310084d..17356ec 100644 --- a/pie/scripts/optimize.py +++ b/pie/scripts/optimize.py @@ -7,7 +7,7 @@ import scipy.stats as stats from pie import utils -from pie.settings import settings_from_file, Settings +from pie import settings # available distributions @@ -60,6 +60,8 @@ def parse_opt(obj, opt_key): raise ValueError("Unknown distribution: ", v[opt_key]) else: opt[param] = parse_opt(v, opt_key) + else: + opt[param] = v return opt @@ -85,6 +87,8 @@ def sample_from_config(opt): output[param] = sample_from_config(dist) elif isinstance(dist, list): output[param] = [sample_from_config(d) for d in dist] + elif isinstance(dist, (str, float, int, bool)): + output[param] = dist # no sampling else: output[param] = dist.rvs() @@ -98,23 +102,24 @@ def run(config, opt, n_iter): print() print("::: Starting optimization run {} :::".format(i + 1)) print() - sampled_config = sample_from_config(opt) - merged = utils.recursive_merge(dict(config), sampled_config, overwrite=True) - print(yaml.dump(dict(config))) - print(yaml.dump(merged)) - train.run(Settings(merged)) + sampled = sample_from_config(opt) + merged = settings.Settings( + utils.recursive_merge(dict(config), sampled, overwrite=True)) + print("::: Sampled config :::") + print(yaml.dump(dict(merged))) + train.run(settings.check_settings(settings.merge_task_defaults(merged))) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('config_path', default='config.json') - parser.add_argument('opt_path') + parser.add_argument('opt_path', help='Path to optimization file (see opt.json)') parser.add_argument('--n_iter', type=int, default=20) args = parser.parse_args() with utils.shutup(): - config = settings_from_file(args.config_path) + config = settings.settings_from_file(args.config_path) opt = read_opt(args.opt_path) diff --git a/pie/scripts/train.py b/pie/scripts/train.py index a577958..054fc2d 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -32,6 +32,10 @@ def get_fname_infix(settings): def run(settings): + # read settings if input is path + if isinstance(settings, str): + settings = settings_from_file(settings) + # seeding now = datetime.now() seed = now.hour * 10000 + now.minute * 100 + now.second @@ -190,4 +194,4 @@ def run(settings): parser = argparse.ArgumentParser() parser.add_argument('config_path', nargs='?', default='config.json') args = parser.parse_args() - run(settings_from_file(args.config_path)) + run(args.config_path) diff --git a/pie/trainer.py b/pie/trainer.py index bc7a219..5118f7e 100644 --- a/pie/trainer.py +++ b/pie/trainer.py @@ -230,7 +230,7 
@@ def evaluate(self, dataset): total_losses, total_batches = collections.defaultdict(float), 0 # get all tasks - tasks = list(self.model.label_encoder.tasks) + tasks = list(self.model.tasks) for batch in tqdm.tqdm(dataset.batch_generator()): total_batches += 1 From 98502fd06a5026663696b48f8ce9f31c9368d63a Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 9 May 2019 17:10:06 +0200 Subject: [PATCH 17/46] added morph condition embeddings --- pie/default_settings.json | 3 ++ pie/models/base_model.py | 2 +- pie/models/decoder.py | 80 ++++++++++++++++++++++++++++--------- pie/models/model.py | 66 +++++++++++++++++++++----------- pie/scripts/train.py | 26 +++++++------ pie/settings.py | 13 ++++++- 6 files changed, 137 insertions(+), 53 deletions(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index a13be5e..e2cef7c 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -46,6 +46,7 @@ "decoder": "attentional", // type of the decoder (linear, attentional, crf) "context": "none", // (only for char-level) add sentential context // (none, word, sentence, both) + "conditions": [], // names of other tasks that this task will be conditioned on "layer": -1, // define at what sentence encoder layer we do this task "settings": { // encoder settings ("max_size", "min_freq", "preprocessor", "eos", "bos") @@ -133,6 +134,8 @@ "merge_type": "concat", // how to merge word-level and char-level embs (mixer or concat) "scorer": "general", // attention type (one of "general", "dot" and "bahdanau") "linear_layers": 1, // number of layers for linear decoders + "cond_emb_dim": 64, // embedding dimension of each conditioning tag embedding + "cond_out_dim": 64, // output dimension of the resulting combined tag embedding "hidden_size": 300, // sentence encoding dimension "num_layers": 1, // num recurrent layers for the sentence encoder "cell": "LSTM" // cell type for rnns diff --git a/pie/models/base_model.py b/pie/models/base_model.py index 3f5f536..6a8b174 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -71,7 +71,7 @@ def evaluate(self, dataset, trainset=None, **kwargs): for (inp, tasks), (rinp, rtasks) in tqdm.tqdm( dataset.batch_generator(return_raw=True)): - preds = self.predict(inp, **kwargs) + preds = self.predict(inp, conds=tasks, **kwargs) # - get input tokens tokens = [w for line in rinp for w in line] diff --git a/pie/models/decoder.py b/pie/models/decoder.py index 3bd4e72..855bbcf 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -54,7 +54,9 @@ class LinearDecoder(nn.Module): label_encoder : LabelEncoder in_features : int, input dimension """ - def __init__(self, label_encoder, in_features, highway_layers=0, highway_act='relu'): + def __init__(self, label_encoder, in_features, dropout=0.0, + cond_label_encoders=(), cond_emb_dim=64, cond_out_dim=64, + highway_layers=0, highway_act='relu'): self.label_encoder = label_encoder super().__init__() @@ -66,17 +68,25 @@ def __init__(self, label_encoder, in_features, highway_layers=0, highway_act='re self.highway = None if highway_layers > 0: self.highway = Highway(in_features, highway_layers, highway_act) + # conds + self.conds = {} + if cond_label_encoders: + self.conds = ConditionEmbedding( + cond_label_encoders, cond_emb_dim, cond_out_dim, dropout=dropout) # decoder output - self.decoder = nn.Linear(in_features, len(label_encoder)) + self.decoder = nn.Linear( + in_features + bool(self.conds) * cond_out_dim, len(label_encoder)) self.init() def init(self): # linear
initialization.init_linear(self.decoder) - def forward(self, enc_outs): + def forward(self, enc_outs, **conds): if self.highway is not None: enc_outs = self.highway(enc_outs) + if self.conds: + enc_outs = torch.cat([enc_outs, self.conds(**conds)], -1) linear_out = self.decoder(enc_outs) return linear_out @@ -89,18 +99,19 @@ def loss(self, logits, targets): return loss - def predict(self, enc_outs, lengths): + def predict(self, enc_outs, lengths, **conds): """ Parameters ========== enc_outs : torch.tensor(seq_len x batch x hidden_size) """ - probs = F.softmax(self.decoder(enc_outs), dim=-1) + probs = F.softmax(self(enc_outs, **conds), dim=-1) probs, preds = torch.max(probs.transpose(0, 1), dim=-1) output_probs, output_preds = [], [] for idx, length in enumerate(lengths.tolist()): - output_preds.append(self.label_encoder.inverse_transform(preds[idx])[:length]) + output_preds.append( + self.label_encoder.inverse_transform(preds[idx])[:length]) output_probs.append(probs[idx].tolist()) return output_preds, output_probs @@ -254,9 +265,16 @@ class AttentionalDecoder(nn.Module): hidden_size : int, hidden size of the encoder, decoder and attention modules. context_dim : int (optional), dimensionality of additional context vectors """ - def __init__(self, label_encoder, in_dim, hidden_size, scorer='general', - context_dim=0, dropout=0.0, num_layers=1, cell='LSTM', - init_rnn='default'): + def __init__(self, label_encoder, in_dim, hidden_size, dropout=0.0, + # rnn + num_layers=1, cell='LSTM', init_rnn='default', + # attention + scorer='general', + # sentence context + context_dim=0, + # conditions + cond_label_encoders=(), cond_emb_dim=64, cond_out_dim=64): + self.label_encoder = label_encoder self.context_dim = context_dim self.num_layers = num_layers @@ -267,13 +285,22 @@ def __init__(self, label_encoder, in_dim, hidden_size, scorer='general', if label_encoder.get_eos() is None or label_encoder.get_bos() is None: raise ValueError("AttentionalDecoder needs <eos> and <bos>") + # nll weight nll_weight = torch.ones(len(label_encoder)) nll_weight[label_encoder.get_pad()] = 0. self.register_buffer('nll_weight', nll_weight) + # emb self.embs = nn.Embedding(len(label_encoder), in_dim) - self.rnn = getattr(nn, cell)(in_dim + context_dim, hidden_size, - num_layers=num_layers, - dropout=dropout if num_layers > 1 else 0) + # conds + self.conds = {} + if cond_label_encoders: + self.conds = ConditionEmbedding( + cond_label_encoders, cond_emb_dim, cond_out_dim, dropout=dropout) + # rnn + self.rnn = getattr(nn, cell)( + in_dim + context_dim + bool(self.conds) * cond_out_dim, hidden_size, + num_layers=num_layers, + dropout=dropout if num_layers > 1 else 0) self.attn = Attention(hidden_size) self.proj = nn.Linear(hidden_size, len(label_encoder)) @@ -287,7 +314,7 @@ def init(self): # linear initialization.init_linear(self.proj) - def forward(self, targets, lengths, enc_outs, src_lengths, context=None): + def forward(self, targets, lengths, enc_outs, src_lengths, context=None, **conds): """ Decoding routine for training. Returns the logits corresponding to the targets for the `loss` method. Takes care of padding.
@@ -295,6 +322,14 @@ def forward(self, targets, lengths, enc_outs, src_lengths, context=None): targets, lengths = targets[:-1], lengths - 1 embs = self.embs(targets) + if self.conds: + # each cond should be (batch) already flattened + # (seq_len x batch x emb_dim) + (batch x cond_dim) + embs = torch.cat( + [embs, + self.conds(**conds).unsqueeze(0).repeat(embs.size(0), 1, 1)], + dim=2) + if self.context_dim > 0: if context is None: raise ValueError("Contextual Decoder needs `context`") @@ -333,8 +368,9 @@ def loss(self, logits, targets): return loss - def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20, - bos=None, eos=None): + def predict_max(self, enc_outs, lengths, + max_seq_len=20, bos=None, eos=None, + context=None, **conds): """ Decoding routine for inference with step-wise argmax procedure @@ -342,12 +378,14 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20, =========== enc_outs : tensor(src_seq_len x batch x hidden_size) context : tensor(batch x hidden_size), optional + conds : {cond : tensor(batch)}, optional """ eos = eos or self.label_encoder.get_eos() bos = bos or self.label_encoder.get_bos() hidden, batch, device = None, enc_outs.size(1), enc_outs.device mask = torch.ones(batch, dtype=torch.int64, device=device) inp = torch.zeros(batch, dtype=torch.int64, device=device) + bos + conds = self.conds(**conds) if self.conds else None hyps, scores = [], 0 for _ in range(max_seq_len): @@ -356,6 +394,8 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20, # prepare input emb = self.embs(inp) + if conds is not None: + emb = torch.cat([emb, conds], dim=1) if context is not None: emb = torch.cat([emb, context], dim=1) # run rnn @@ -378,8 +418,9 @@ def predict_max(self, enc_outs, lengths, context=None, max_seq_len=20, return hyps, scores - def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12, - eos=None, bos=None): + def predict_beam(self, enc_outs, lengths, + max_seq_len=50, width=12, eos=None, bos=None, + context=None, **conds): """ Decoding routine for inference with beam search @@ -387,6 +428,7 @@ def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12 =========== enc_outs : tensor(src_seq_len x batch x hidden_size) context : tensor(batch x hidden_size), optional + conds : {cond : tensor(batch)}, optional """ eos = eos or self.label_encoder.get_eos() bos = bos or self.label_encoder.get_bos() @@ -401,6 +443,8 @@ def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12 if context is not None: # (beam * batch x context_dim) context = context.repeat(width, 1) + if self.conds: + conds = self.conds(**conds).repeat(width, 1) for _ in range(max_seq_len): if all(not beam.active for beam in beams): @@ -411,6 +455,8 @@ def predict_beam(self, enc_outs, lengths, context=None, max_seq_len=50, width=12 inp = inp.view(-1) # (beam * batch x emb_dim) emb = self.embs(inp) + if conds is not None: + emb = torch.cat([emb, conds], dim=1) if context is not None: # (beam * batch x emb_dim + context_dim) emb = torch.cat([emb, context], dim=1) diff --git a/pie/models/model.py b/pie/models/model.py index 0adff37..a8ef8b6 100644 --- a/pie/models/model.py +++ b/pie/models/model.py @@ -24,6 +24,14 @@ def get_context(outs, wemb, wlen, context_type): return None +def get_conds(conds, flatten=False): + if flatten: + return {c: torch_utils.flatten_padded_batch(t, tlen) + for c, (t, tlen) in conds.items()} + else: + return {c: t for c, (t, _) in conds.items()} + + class 
SimpleModel(BaseModel): """ Parameters @@ -44,11 +52,11 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la dropout=0.0, word_dropout=0.0, # word embeddings merge_type='concat', cemb_type='RNN', cemb_layers=1, - custom_cemb_cell=False, scorer='general', + custom_cemb_cell=False, # lm joint loss include_lm=True, lm_shared_softmax=True, # decoder - linear_layers=1, + scorer='general', linear_layers=1, cond_emb_dim=64, cond_out_dim=64, # kwargs **kwargs): # args @@ -63,11 +71,13 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la self.merge_type = merge_type self.cemb_type = cemb_type self.cemb_layers = cemb_layers - self.scorer = scorer + self.custom_cemb_cell = custom_cemb_cell self.include_lm = include_lm self.lm_shared_softmax = lm_shared_softmax - self.custom_cemb_cell = custom_cemb_cell + self.scorer = scorer self.linear_layers = linear_layers + self.cond_emb_dim = cond_emb_dim + self.cond_out_dim = cond_out_dim # only during training self.init_rnn = init_rnn super().__init__(label_encoder, tasks) @@ -129,11 +139,10 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la # Decoders decoders = {} for tname, task in self.tasks.items(): - if task['level'].lower() == 'char': - if self.cemb is None: - raise ValueError("Char-level decoder requires char embeddings") + # check conditions + cond_encoders = [label_encoder.tasks[t] for t in task.get('conditions', [])] - # TODO: add sentence context to decoder + if task['level'].lower() == 'char': if task['decoder'].lower() == 'linear': decoder = LinearDecoder( label_encoder.tasks[tname], self.cemb.embedding_dim) @@ -141,7 +150,6 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la decoder = CRFDecoder( label_encoder.tasks[tname], self.cemb.embedding_dim) elif task['decoder'].lower() == 'attentional': - # get context size context_dim = 0 if task['context'].lower() == 'sentence': context_dim = hidden_size * 2 # bidirectional encoder @@ -149,12 +157,12 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la context_dim = wemb_dim elif task['context'].lower() == 'both': context_dim = hidden_size * 2 + wemb_dim - decoder = AttentionalDecoder( label_encoder.tasks[tname], cemb_dim, self.cemb.embedding_dim, context_dim=context_dim, scorer=scorer, num_layers=cemb_layers, - cell=cell, dropout=dropout, init_rnn=init_rnn) - + cell=cell, dropout=dropout, init_rnn=init_rnn, + cond_label_encoders=tuple(cond_encoders), + cond_emb_dim=cond_emb_dim, cond_out_dim=cond_out_dim) else: raise ValueError( "Unknown decoder type {} for char-level task: {}".format( @@ -164,8 +172,10 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la # linear if task['decoder'].lower() == 'linear': decoder = LinearDecoder( - label_encoder.tasks[tname], hidden_size * 2, - highway_layers=linear_layers - 1) + label_encoder.tasks[tname], hidden_size * 2, dropout=dropout, + highway_layers=linear_layers - 1, + cond_label_encoders=tuple(cond_encoders), + cond_emb_dim=cond_emb_dim, cond_out_dim=cond_out_dim) # crf elif task['decoder'].lower() == 'crf': decoder = CRFDecoder( @@ -197,6 +207,8 @@ def get_args_and_kwargs(self): 'cell': self.cell, 'merge_type': self.merge_type, 'linear_layers': self.linear_layers, + 'cond_emb_dim': self.cond_emb_dim, + 'cond_out_dim': self.cond_out_dim, 'cemb_type': self.cemb_type, 'cemb_layers': self.cemb_layers, 'include_lm': self.include_lm, @@ -282,11 +294,13 @@ def loss(self, 
batch_data, *target_tasks): cemb_outs = F.dropout( cemb_outs, p=self.dropout, training=self.training) context = get_context(outs, wemb, wlen, self.tasks[task]['context']) - logits = decoder(target, length, cemb_outs, clen, context) + logits = decoder( + target, length, cemb_outs, clen, + context=context, **get_conds(tasks, flatten=True)) output[task] = decoder.loss(logits, target) else: if isinstance(decoder, LinearDecoder): - logits = decoder(outs) + logits = decoder(outs, **get_conds(tasks)) output[task] = decoder.loss(logits, target) elif isinstance(decoder, CRFDecoder): logits = decoder(outs) @@ -308,7 +322,12 @@ def loss(self, batch_data, *target_tasks): return output - def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): + def predict(self, inp, *tasks, conds={}, use_beam=False, beam_width=10, **kwargs): + """ + inp : (word, wlen), (char, clen), text input + tasks : list of str, target tasks + conds : {task : (t, tlen)}, optional, used for conditional decoding + """ tasks = set(self.tasks if not len(tasks) else tasks) preds = {} (word, wlen), (char, clen) = inp @@ -344,19 +363,20 @@ def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): else: context = get_context(outs, wemb, wlen, self.tasks[task]['context']) if use_beam: - hyps, _ = decoder.predict_beam(cemb_outs, clen, - context=context, width=beam_width) + hyps, _ = decoder.predict_beam( + cemb_outs, clen, width=beam_width, + context=context, **get_conds(conds, flatten=True)) else: - hyps, _ = decoder.predict_max(cemb_outs, clen, context=context) + hyps, _ = decoder.predict_max( + cemb_outs, clen, + context=context, **get_conds(conds, flatten=True)) if self.label_encoder.tasks[task].preprocessor_fn is None: hyps = [''.join(hyp) for hyp in hyps] else: if isinstance(decoder, LinearDecoder): - hyps, _ = decoder.predict(outs, wlen) + hyps, _ = decoder.predict(outs, wlen, **get_conds(conds)) elif isinstance(decoder, CRFDecoder): hyps, _ = decoder.predict(outs, wlen) - else: - raise ValueError() preds[task] = hyps diff --git a/pie/scripts/train.py b/pie/scripts/train.py index 054fc2d..6828e07 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -90,17 +90,21 @@ def run(settings): logging.warning("No devset: cannot monitor/optimize training") # model - model = SimpleModel(label_encoder, settings.tasks, - settings.wemb_dim, settings.cemb_dim, settings.hidden_size, - settings.num_layers, dropout=settings.dropout, - cell=settings.cell, cemb_type=settings.cemb_type, - cemb_layers=settings.cemb_layers, - custom_cemb_cell=settings.custom_cemb_cell, - linear_layers=settings.linear_layers, - scorer=settings.scorer, - word_dropout=settings.word_dropout, - lm_shared_softmax=settings.lm_shared_softmax, - include_lm=settings.include_lm) + model = SimpleModel( + label_encoder, settings.tasks, + settings.wemb_dim, settings.cemb_dim, settings.hidden_size, settings.num_layers, + cell=settings.cell, + # dropout + dropout=settings.dropout, word_dropout=settings.word_dropout, + # word embeddings + merge_type=settings.merge_type, cemb_type=settings.cemb_type, + cemb_layers=settings.cemb_layers, custom_cemb_cell=settings.custom_cemb_cell, + # lm joint loss + include_lm=settings.include_lm, lm_shared_softmax=settings.lm_shared_softmax, + # decoder + scorer=settings.scorer, linear_layers=settings.linear_layers, + cond_emb_dim=settings.cond_emb_dim, cond_out_dim=settings.cond_out_dim + ) # pretrain(/load pretrained) embeddings if model.wemb is not None: diff --git a/pie/settings.py b/pie/settings.py index 
9c44921..8473bb4 100644 --- a/pie/settings.py +++ b/pie/settings.py @@ -84,12 +84,23 @@ def parse_env_settings(defaults): def check_settings(settings): - # - check at least and at most one target has_target = False + tasks = set(task['name'] for task in settings.tasks) + for task in settings.tasks: + # - check input char embeddings for attentional decoder + if task['decoder'] == 'attentional': + if settings.cemb_type.lower() not in ('rnn', 'cnn'): + raise ValueError("Attentional decoder needs character embeddings") + # - check conditions + for task2 in task.get('conditions', []): + if task2 not in tasks: + raise ValueError("Task '{}' requires task '{}'".format(task['name'], task2)) + # - check at least and at most one target if len(settings.tasks) == 1: task['target'] = True if task.get('target', False): + # - check target if task.get('read_only'): raise ValueError("Target task cannot be 'read_only'") if has_target: raise ValueError("Got more than one target task") has_target = True From 7b77dd479b76b29e647757c8483f5f452f5e2b47 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 9 May 2019 17:19:37 +0200 Subject: [PATCH 18/46] added example opt.json file --- opt.json | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 opt.json diff --git a/opt.json b/opt.json new file mode 100644 index 0000000..544144e --- /dev/null +++ b/opt.json @@ -0,0 +1,70 @@ + +{ + "lr": { + "opt": "truncnorm", + "params": { + "mu": 0.0025, + "std": 0.002, + "lower": 0.0001, + "upper": 1 + } + }, + + "dropout": { + "opt": "truncnorm", + "params": { + "mu": 0.2, + "std": 0.15, + "upper": 0.5 + } + }, + + "custom_cemb_cell": { + "opt": "choice", + "params": [true, false] + }, + + "tasks": [ // tasks from the main config file get overwritten by these tasks + { + "name": "pos", + "target": true, + "decoder": { + "opt": "choice", + "params": ["linear", "crf"] + } + }, + { + "name": "morph" + } + ], + + "include_lm": { + "opt": "choice", + "params": [true, false] + }, + + "num_layers": { + "opt": "choice", + "params": [1, 2] + }, + + "cemb_dim": { + "opt": "normint", + "params": { + "mu": 100, + "std": 50, + "lower": 50, + "upper": 500 + } + }, + + "hidden_size": { + "opt": "normint", + "params": { + "mu": 150, + "std": 100, + "lower": 50, + "upper": 500 + } + } } From 56962f02aaafe7b4122001a506e58d9d60837ca9 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 30 May 2019 12:11:12 +0200 Subject: [PATCH 19/46] fixes --- pie/data/dataset.py | 2 ++ pie/models/decoder.py | 5 ++--- pie/scripts/evaluate.py | 18 ++++++++++++---- pie/tagger.py | 38 ++++++++++++++++++++++++++++--------- pie/trainer.py | 7 ++++--- 5 files changed, 50 insertions(+), 20 deletions(-) diff --git a/pie/data/dataset.py b/pie/data/dataset.py index 800167b..7d538d2 100644 --- a/pie/data/dataset.py +++ b/pie/data/dataset.py @@ -332,6 +332,8 @@ def fit(self, lines): for le in self.tasks.values(): le.compute_vocab() + return self + def fit_reader(self, reader): """ fit reader in a non verbose way (to warn about parsing issues) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index 855bbcf..49be273 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -17,7 +17,7 @@ class ConditionEmbedding(nn.Module): """ Embed tags and project onto a fixed-size tag embedding """ - def __init__(self, label_encoders, emb_dim, out_features, dropout=0): + def __init__(self, label_encoders, emb_dim, out_features, dropout=0.0): self.dropout = dropout super().__init__() @@ -443,8 +443,7 @@ def predict_beam(self, enc_outs, lengths, if context is not None: #
(beam * batch x context_dim) context = context.repeat(width, 1) - if self.conds: - conds = self.conds(**conds).repeat(width, 1) + conds = self.conds(**conds).repeat(width, 1) if self.conds else None for _ in range(max_seq_len): if all(not beam.active for beam in beams): diff --git a/pie/scripts/evaluate.py b/pie/scripts/evaluate.py index 586a86e..ebec58e 100644 --- a/pie/scripts/evaluate.py +++ b/pie/scripts/evaluate.py @@ -7,18 +7,24 @@ from pie.settings import load_default_settings, settings_from_file -def run(model_path, test_path, train_path, - settings, batch_size, buffer_size, device, model_info, full, confusion): +def run(model_path, test_path, train_path, # data + settings, batch_size, buffer_size, use_beam, beam_width, device, # decoding + model_info, full, confusion # output + ): model = BaseModel.load(model_path).to(device) if model_info: print(model) + + # settings if hasattr(model, '_settings'): # new models should all have _settings settings = model._settings elif settings: + print("Using user specified settings file: {}".format(settings)) with utils.shutup(): settings = settings_from_file(settings) else: + print("Warning! Using default settings") with utils.shutup(): settings = load_default_settings() @@ -28,14 +34,15 @@ def run(model_path, test_path, train_path, settings.batch_size = batch_size settings.buffer_size = buffer_size settings.device = device settings.shuffle = False # avoid shuffling + # read datasets trainset = None if train_path: trainset = Dataset( settings, Reader(settings, train_path), model.label_encoder) testset = Dataset(settings, Reader(settings, *test_path), model.label_encoder) - for task in model.evaluate(testset, trainset).values(): + # evaluate + for task in model.evaluate(testset, trainset, use_beam=use_beam, beam_width=beam_width).values(): task.print_summary(full=full, confusion_matrix=confusion) @@ -49,6 +56,8 @@ def run(model_path, test_path, train_path, parser.add_argument('--settings', help="settings file used for training") parser.add_argument('--batch_size', type=int, default=500) parser.add_argument('--buffer_size', type=int, default=100000) + parser.add_argument('--use_beam', action='store_true') + parser.add_argument('--beam_width', type=int, default=12) parser.add_argument('--device', default='cpu') parser.add_argument('--model_info', action='store_true') parser.add_argument('--full', action='store_true') parser.add_argument('--confusion', default=False, action="store_true") args = parser.parse_args() run(model_path=args.model_path, test_path=args.test_path, train_path=args.train_path, settings=args.settings, batch_size=args.batch_size, buffer_size=args.buffer_size, + use_beam=args.use_beam, beam_width=args.beam_width, device=args.device, model_info=args.model_info, full=args.full, confusion=args.confusion) diff --git a/pie/tagger.py b/pie/tagger.py index 7c0e5b5..607bc8c 100644 --- a/pie/tagger.py +++ b/pie/tagger.py @@ -35,18 +35,36 @@ def simple_tokenizer(text, lower): yield sentence -def lines_from_file(fpath, lower=False): +def lines_from_file(fpath, lower=False, tokenize=False, max_sent_len=35): + """ + lower : bool, whether to lowercase output + tokenize : bool, whether to use simple_tokenizer + max_sent_len : int, only applicable if tokenize is False + """ with open(fpath) as f: for line in f: - for sentence in simple_tokenizer(line, lower): - yield sentence, len(sentence) + sentence = [] + if not tokenize: + for w in line.split(): + if len(sentence) >= max_sent_len: + yield sentence, len(sentence) + sentence = [] + sentence.append(w.lower() if lower else w) + else: + for sentence in simple_tokenizer(line, lower): + yield sentence, len(sentence) +
sentence = [] + + if sentence: # yield remaining words when tokenize is False + yield sentence, len(sentence) class Tagger(): - def __init__(self, device='cpu', batch_size=100, lower=False): + def __init__(self, device='cpu', batch_size=100, lower=False, tokenize=False): self.device = device self.batch_size = batch_size self.lower = lower + self.tokenize = tokenize self.models = [] def add_model(self, model_path, *tasks): @@ -67,6 +85,7 @@ def tag(self, sents, lengths, **kwargs): output = {} for model, tasks in self.models: model.to(self.device) + inp, _ = pack_batch(model.label_encoder, batch, self.device) # inference @@ -110,11 +129,9 @@ def tag_file(self, fpath, sep='\t', **kwargs): header = False with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f: - - for chunk in utils.chunks( - lines_from_file(fpath, self.lower), self.batch_size): - sents, lengths = zip(*chunk) - tagged, tasks = self.tag(sents, lengths, **kwargs) + lines = lines_from_file(fpath, lower=self.lower, tokenize=self.tokenize) + for chunk in utils.chunks(lines, self.batch_size): + tagged, tasks = self.tag(*zip(*chunk), **kwargs) for sent in tagged: if not header: @@ -123,4 +140,5 @@ def tag_file(self, fpath, sep='\t', **kwargs): for token, tags in sent: f.write(sep.join([token] + list(tags)) + '\n') - f.write('\n') + if self.tokenize: + f.write('\n') diff --git a/pie/trainer.py b/pie/trainer.py index 5118f7e..1d58a4d 100644 --- a/pie/trainer.py +++ b/pie/trainer.py @@ -62,12 +62,12 @@ def __init__(self, settings): tasks[task['name']]['target'] = task.get('target', False) # add task data for lm loss if settings.include_lm: - tasks['lm_fwd'] = settings.lm_schedule - tasks['lm_bwd'] = settings.lm_schedule + tasks['lm_fwd'] = dict(settings.lm_schedule) + tasks['lm_bwd'] = dict(settings.lm_schedule) for task, tdata in tasks.items(): # set step counter - tdata['step'] = 0 + tdata['steps'] = 0 # set default task mode tdata['mode'] = tdata.get('mode', 'max') # set initial weight @@ -212,6 +212,7 @@ def __init__(self, settings, model, dataset, num_instances): print(self.task_scheduler) print() print("::: LR schedule :::") + print() print(self.lr_scheduler) print() From 6b4bf701fee635d5ce2c05c86f67f230784a7860 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 30 May 2019 12:12:26 +0200 Subject: [PATCH 20/46] minor --- opt.json | 4 ++-- pie/data/dataset.py | 5 ++++- pie/default_settings.json | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/opt.json b/opt.json index 544144e..c4a227f 100644 --- a/opt.json +++ b/opt.json @@ -29,8 +29,8 @@ "name": "pos", "target": true, "decoder": { - "opt": "choice", - "params": ["linear", "crf"] + "opt": "choice", + "params": ["linear", "crf"] } }, { diff --git a/pie/data/dataset.py b/pie/data/dataset.py index 7d538d2..c60624e 100644 --- a/pie/data/dataset.py +++ b/pie/data/dataset.py @@ -289,7 +289,10 @@ def add_task(self, name, **meta): # check (not suitable for linear models) if meta['level'].lower() != 'char' and (meta.get('eos') or meta.get('bos')): raise ValueError( - '[Task: {task}] => `bos` and `eos` options are only compatible with char-level tasks but got level: "{level}". Aborting!!!'.format(task=name, level=meta['level'])) + ('[Task: {task}] => `bos` and `eos` options are ' + 'only compatible with char-level tasks but got ' + 'level: "{level}". 
Aborting!!!').format( + task=name, level=meta['level'])) return self diff --git a/pie/default_settings.json b/pie/default_settings.json index e2cef7c..5cb4139 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -103,7 +103,7 @@ // * Training "buffer_size": 10000, // maximum number of sentence in memory at any given time "minimize_pad": false, // preprocess data to have similar sentence lengths inside batch - "epochs": 5, // number of epochs + "epochs": 500, // number of epochs "batch_size": 50, // batch size "shuffle": true, // whether to shuffle input batches "device": "cpu", // device to be used for training (use cuda:device_number for GPU) From 7f19b0a5b334221d8ca4eaf19895e3a70f3e52c5 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Thu, 30 May 2019 12:12:39 +0200 Subject: [PATCH 21/46] abstract stats function over readers --- pie/models/scorer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pie/models/scorer.py b/pie/models/scorer.py index afd06af..2345121 100644 --- a/pie/models/scorer.py +++ b/pie/models/scorer.py @@ -10,9 +10,9 @@ from pie import constants -def get_ambiguous_tokens(trainset, label_encoder): +def get_ambiguous_tokens(reader, label_encoder): ambs = defaultdict(Counter) - for _, (inp, tasks) in trainset.reader.readsents(): + for _, (inp, tasks) in reader.readsents(): trues = label_encoder.preprocess(tasks[label_encoder.target], inp) for tok, true in zip(inp, trues): ambs[tok][true] += 1 @@ -20,9 +20,9 @@ def get_ambiguous_tokens(trainset, label_encoder): return set(tok for tok in ambs if len(ambs[tok]) > 1) -def get_known_tokens(trainset): +def get_known_tokens(reader): known = set() - for _, (inp, _) in trainset.reader.readsents(): + for _, (inp, _) in reader.readsents(): for tok in inp: known.add(tok) return known @@ -49,8 +49,8 @@ def __init__(self, label_encoder, trainset=None): self.label_encoder = label_encoder self.known_tokens = self.amb_tokens = None if trainset: - self.known_tokens = get_known_tokens(trainset) - self.amb_tokens = get_ambiguous_tokens(trainset, label_encoder) + self.known_tokens = get_known_tokens(trainset.reader) + self.amb_tokens = get_ambiguous_tokens(trainset.reader, label_encoder) self.preds = [] self.trues = [] self.tokens = [] From 8035d075f6b4208981913454de3fd50d3ab68336 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 21 Apr 2020 12:29:05 +0200 Subject: [PATCH 22/46] Added beam options to evaluate --- pie/scripts/evaluate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pie/scripts/evaluate.py b/pie/scripts/evaluate.py index 73bb2bc..7b3d9fb 100644 --- a/pie/scripts/evaluate.py +++ b/pie/scripts/evaluate.py @@ -77,8 +77,10 @@ def run(model_path, test_path, train_path, # data parser.add_argument('--model_info', action='store_true') parser.add_argument('--full', action='store_true') parser.add_argument('--confusion', default=False, action="store_true") - parser.add_argument('--report', default=False, action="store_true", help="Get full report on each class") - parser.add_argument('--markdown', default=False, action="store_true", help="Use Markdown") + parser.add_argument('--report', default=False, + action="store_true", help="Get full report on each class") + parser.add_argument('--markdown', default=False, + action="store_true", help="Use Markdown") args = parser.parse_args() run(model_path=args.model_path, test_path=args.test_path, train_path=args.train_path, settings=args.settings, From fc87d00cccbf428ac1888db52e64fb303b3f0fbb Mon Sep 
17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 11:07:56 +0200 Subject: [PATCH 23/46] deprecation fix --- pie/models/decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index d7ba31c..8179462 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -37,7 +37,7 @@ def forward(self, inp): current = inp for layer in self.layers: inp, gate = layer(current).chunk(2, dim=-1) - inp, gate = getattr(F, self.act)(inp), F.sigmoid(gate) + inp, gate = getattr(F, self.act)(inp), torch.sigmoid(gate) current = gate * current + (1 - gate) * inp return current From 73156e93d9f0b13222ca28ed634140dc015a6af9 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 11:08:48 +0200 Subject: [PATCH 24/46] highway was broken :-s --- pie/models/decoder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index 8179462..db6ca6c 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -94,12 +94,15 @@ def predict(self, enc_outs, lengths): ========== enc_outs : torch.tensor(seq_len x batch x hidden_size) """ + if self.highway is not None: + enc_outs = self.highway(enc_outs) probs = F.softmax(self.decoder(enc_outs), dim=-1) probs, preds = torch.max(probs.transpose(0, 1), dim=-1) output_probs, output_preds = [], [] for idx, length in enumerate(lengths.tolist()): - output_preds.append(self.label_encoder.inverse_transform(preds[idx])[:length]) + output_preds.append( + self.label_encoder.inverse_transform(preds[idx])[:length]) output_probs.append(probs[idx].tolist()) return output_preds, output_probs @@ -210,6 +213,8 @@ def loss(self, logits, targets, lengths): return torch.mean(Z - score) def predict(self, enc_outs, lengths): + if self.highway is not None: + enc_outs = self.highway(enc_outs) # (seq_len x batch x vocab) logits = self.projection(enc_outs) seq_len, _, vocab = logits.size() start_tag, end_tag = vocab, vocab + 1 From e0edf12457420743bccaeb3491dcff1fbd5aba59 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 11:10:19 +0200 Subject: [PATCH 25/46] add option to cache batches (speed up for lengthy preprocessing workflows) --- pie/data/dataset.py | 27 +++++++++++++++++++++++++++ pie/default_settings.json | 1 + 2 files changed, 28 insertions(+) diff --git a/pie/data/dataset.py b/pie/data/dataset.py index e0fc62c..8dd67e4 100644 --- a/pie/data/dataset.py +++ b/pie/data/dataset.py @@ -459,10 +459,12 @@ def __init__(self, settings, reader, label_encoder): self.device = settings.device self.shuffle = settings.shuffle self.minimize_pad = settings.minimize_pad + self.cache_dataset = settings.cache_dataset # data self.reader = reader self.label_encoder = label_encoder + self.cached = [] @staticmethod def get_nelement(batch): @@ -510,6 +512,23 @@ def batch_generator(self, return_raw=False): - char : tensor(length, batch_size * words), padded lengths * (tasks) dictionary with tasks """ + if self.cache_dataset: + if not self.cached: + self.cache_batches() + if self.shuffle: + random.shuffle(self.cached) + + for batch, raw in self.cached: + # move to device + batch = tuple(list(wrap_device(batch, self.device))) + if return_raw: + yield batch, raw + else: + yield batch + else: + yield from self.batch_generator_(return_raw=return_raw) + + def batch_generator_(self, return_raw=False): buf = [] for (fpath, line_num), data in self.reader.readsents(): @@ -524,6 +543,14 @@ def batch_generator(self, return_raw=False): if len(buf) > 0: yield from self.prepare_buffer(buf,
return_raw=return_raw) + def cache_batches(self): + if self.cached: + return + + buf = [data for _, data in self.reader.readsents()] + for batch, raw in self.prepare_buffer(buf, return_raw=True, device='cpu'): + self.cached.append((batch, raw)) + def pack_batch(label_encoder, batch, device=None): """ diff --git a/pie/default_settings.json b/pie/default_settings.json index 908c0f2..1b5ff9e 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -97,6 +97,7 @@ // * Training "buffer_size": 10000, // maximum number of sentence in memory at any given time + "cache_dataset": false, // precompute batches and keep them in memory "minimize_pad": false, // preprocess data to have similar sentence lengths inside batch "epochs": 5, // number of epochs "batch_size": 50, // batch size From bf29618890db7b3eeab271cfba7c19c0bfda7dfc Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 12:27:25 +0200 Subject: [PATCH 26/46] added optimize (random search) --- pie/__init__.py | 2 +- pie/{scripts => }/optimize.py | 27 +++++---------------------- pie/scripts/train.py | 30 +++++++++++++++++------------- 3 files changed, 23 insertions(+), 36 deletions(-) rename pie/{scripts => }/optimize.py (80%) diff --git a/pie/__init__.py b/pie/__init__.py index 57ecc16..66f581d 100644 --- a/pie/__init__.py +++ b/pie/__init__.py @@ -1,5 +1,4 @@ - from .utils import GitInfo try: @@ -22,6 +21,7 @@ from . import settings from . import tagger from . import initialization +from . import optimize from .data import * from .models import * from .pretrain_encoder import Encoder diff --git a/pie/scripts/optimize.py b/pie/optimize.py similarity index 80% rename from pie/scripts/optimize.py rename to pie/optimize.py index 17356ec..1742b6c 100644 --- a/pie/scripts/optimize.py +++ b/pie/optimize.py @@ -7,7 +7,8 @@ import scipy.stats as stats from pie import utils -from pie import settings +from pie.settings import settings_from_file, check_settings, merge_task_defaults +from pie.settings import Settings # available distributions @@ -95,32 +96,14 @@ def sample_from_config(opt): return output -def run(config, opt, n_iter): - import train - +def run_optimize(train_fn, config, opt, n_iter, **kwargs): for i in range(n_iter): print() print("::: Starting optimization run {} :::".format(i + 1)) print() sampled = sample_from_config(opt) - merged = settings.Settings( + merged = Settings( utils.recursive_merge(dict(config), sampled, overwrite=True)) print("::: Sampled config :::") print(yaml.dump(dict(merged))) - train.run(settings.check_settings(settings.merge_task_defaults(merged))) - - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('config_path', default='config.json') - parser.add_argument('opt_path', help='Path to optimization file (see opt.json)') - parser.add_argument('--n_iter', type=int, default=20) - args = parser.parse_args() - - with utils.shutup(): - config = settings.settings_from_file(args.config_path) - - opt = read_opt(args.opt_path) - - run(config, opt, args.n_iter) + train_fn(check_settings(merge_task_defaults(merged)), **kwargs) diff --git a/pie/scripts/train.py b/pie/scripts/train.py index 6828e07..5fa2231 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -5,15 +5,15 @@ import logging from datetime import datetime - import pie -from pie import utils from pie.settings import settings_from_file from pie.trainer import Trainer from pie import initialization from pie.data import Dataset, Reader, MultiLabelEncoder from 
pie.models import SimpleModel, get_pretrained_embeddings +from pie import optimize +# set seeds import random import numpy import torch @@ -32,12 +32,9 @@ def get_fname_infix(settings): def run(settings): - # read settings if input is path - if isinstance(settings, str): - settings = settings_from_file(settings) - - # seeding now = datetime.now() + + # set seed seed = now.hour * 10000 + now.minute * 100 + now.second print("Using seed:", seed) random.seed(seed) @@ -92,8 +89,8 @@ def run(settings): # model model = SimpleModel( label_encoder, settings.tasks, - settings.wemb_dim, settings.cemb_dim, settings.hidden_size, settings.num_layers, - cell=settings.cell, + settings.wemb_dim, settings.cemb_dim, settings.hidden_size, + settings.num_layers, cell=settings.cell, # dropout dropout=settings.dropout, word_dropout=settings.word_dropout, # word embeddings @@ -102,9 +99,7 @@ def run(settings): # lm joint loss include_lm=settings.include_lm, lm_shared_softmax=settings.lm_shared_softmax, # decoder - scorer=settings.scorer, linear_layers=settings.linear_layers, - cond_emb_dim=settings.cond_emb_dim, cond_out_dim=settings.cond_out_dim - ) + scorer=settings.scorer, linear_layers=settings.linear_layers) # pretrain(/load pretrained) embeddings if model.wemb is not None: @@ -197,5 +192,14 @@ def run(settings): import argparse parser = argparse.ArgumentParser() parser.add_argument('config_path', nargs='?', default='config.json') + parser.add_argument('--opt_path', help='Path to optimization file (see opt.json)') + parser.add_argument('--n_iter', type=int, default=20) args = parser.parse_args() - run(args.config_path) + + settings = settings_from_file(args.config_path) + + if args.opt_path: + opt = optimize.read_opt(args.opt_path) + optimize.run_optimize(run, settings, opt, args.n_iter) + else: + run(settings) From c84e9687d10fa7c05eb429a1dfcdac1276bd22f2 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 15:41:33 +0200 Subject: [PATCH 27/46] minimum formatting changes --- pie/models/scorer.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pie/models/scorer.py b/pie/models/scorer.py index 28589e3..eb3c326 100644 --- a/pie/models/scorer.py +++ b/pie/models/scorer.py @@ -257,8 +257,8 @@ def error_summary(true, count, preds): return '\n'.join(summary) - def print_summary(self, full=False, most_common=100, confusion_matrix=False, scores=None, - report=False, markdown=True): + def print_summary(self, full=False, most_common=100, confusion_matrix=False, + scores=None, report=False, markdown=True): """ Get evaluation summary @@ -272,7 +272,8 @@ def print_summary(self, full=False, most_common=100, confusion_matrix=False, sco if markdown: print("## " + self.label_encoder.name) else: - print("::: Evaluation report for task: {} :::".format(self.label_encoder.name)) + print("::: Evaluation report for task: {} :::".format( + self.label_encoder.name)) print() if scores is None: @@ -280,7 +281,7 @@ def print_summary(self, full=False, most_common=100, confusion_matrix=False, sco # print scores if markdown: - print(self.scores_in_markdown(scores)) + print(self.scores_in_markdown(scores) + '\n') else: print(yaml.dump(scores, default_flow_style=False)) @@ -289,7 +290,8 @@ def print_summary(self, full=False, most_common=100, confusion_matrix=False, sco if markdown: print("### Error summary for task {}".format(self.label_encoder.name)) else: - print("::: Error summary for task: {} :::".format(self.label_encoder.name)) + print("::: Error summary for task: {} 
:::".format( + self.label_encoder.name)) print() if self.label_encoder.level == 'char': print(self.get_transduction_summary(most_common=most_common)) @@ -312,18 +314,18 @@ def print_summary(self, full=False, most_common=100, confusion_matrix=False, sco else: print("::: Confusion Matrix :::") print() - print((github_table.GithubFlavoredMarkdownTable(self.get_confusion_matrix_table())).table) + print(github_table.GithubFlavoredMarkdownTable( + self.get_confusion_matrix_table()).table) def get_classification_report(self): return classification_report( y_true=self.trues, - y_pred=self.preds - ) + y_pred=self.preds) @staticmethod def scores_in_markdown(scores): measures = ["accuracy", "precision", "recall", "support"] - table = [[""]+measures] + table = [[""] + measures] for key in scores: table.append([key, *[scores[key][meas] for meas in measures]]) @@ -374,4 +376,4 @@ def classification_report(y_true, y_pred, digits=2): str(np.sum(s))) tbl_rows.append(last_row) - return (github_table.GithubFlavoredMarkdownTable([headers]+tbl_rows)).table + return github_table.GithubFlavoredMarkdownTable([headers] + tbl_rows).table From 8c997e861deb20709e1f44fd05a082f8c32bb07b Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 15:54:57 +0200 Subject: [PATCH 28/46] rm read_only --- pie/models/base_model.py | 2 -- pie/settings.py | 3 --- pie/trainer.py | 3 --- 3 files changed, 8 deletions(-) diff --git a/pie/models/base_model.py b/pie/models/base_model.py index 3f5f536..0f9a2be 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -29,8 +29,6 @@ def __init__(self, label_encoder, tasks, *args, **kwargs): # prepare input task data from task settings if isinstance(tasks, list): tasks = {task['name']: task for task in tasks} - # drop read-only tasks - self.tasks = {t: task for t, task in tasks.items() if not task.get('read_only')} super().__init__() def loss(self, batch_data): diff --git a/pie/settings.py b/pie/settings.py index 8473bb4..d2a266a 100644 --- a/pie/settings.py +++ b/pie/settings.py @@ -100,9 +100,6 @@ def check_settings(settings): if len(settings.tasks) == 1: task['target'] = True if task.get('target', False): - # - check target - if task.get('read_only'): - raise ValueError("Target task cannot be 'read_only'") if has_target: raise ValueError("Got more than one target task") has_target = True diff --git a/pie/trainer.py b/pie/trainer.py index 2ea2184..f8dcf1d 100644 --- a/pie/trainer.py +++ b/pie/trainer.py @@ -54,9 +54,6 @@ def __init__(self, settings): tasks = {} # preprocess tasks for task in settings.tasks: - # ignore read-only - if task.get('read_only'): - continue # add schedule and target tasks[task['name']] = task.get('schedule', {}) tasks[task['name']]['target'] = task.get('target', False) From 882bb70faca655ead11ce3ad15eded9b389b33de Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 16:30:59 +0200 Subject: [PATCH 29/46] fixes --- pie/models/decoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index a20e579..e56a87d 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -44,11 +44,9 @@ def init(self): # linear initialization.init_linear(self.decoder) - def forward(self, enc_outs, **conds): + def forward(self, enc_outs): if self.highway is not None: enc_outs = self.highway(enc_outs) - if self.conds: - enc_outs = torch.cat([enc_outs, self.conds(**conds)], -1) linear_out = self.decoder(enc_outs) return linear_out From 
dc131e1bc9f64d6704e730e5bc088c787d42be5c Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 16:31:15 +0200 Subject: [PATCH 30/46] train run changed signature --- pie/scripts/group.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pie/scripts/group.py b/pie/scripts/group.py index d70d23f..0db9222 100644 --- a/pie/scripts/group.py +++ b/pie/scripts/group.py @@ -82,7 +82,8 @@ def evaluate(model_path, test_path, train_path, settings, batch_size, def train(config_path): """ Train a model using the file at [CONFIG_PATH]""" import pie.scripts.train - pie.scripts.train.run(config_path=config_path) + import pie.settings + pie.scripts.train.run(pie.settings.settings_from_file(config_path)) if __name__ == "__main__": From 0869987ce600fd9af3624e7fa204bf7886b4674a Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Fri, 24 Apr 2020 16:38:37 +0200 Subject: [PATCH 31/46] cosmetic --- pie/models/base_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pie/models/base_model.py b/pie/models/base_model.py index 7cf587b..c201d50 100644 --- a/pie/models/base_model.py +++ b/pie/models/base_model.py @@ -36,7 +36,8 @@ def __init__(self, label_encoder, tasks, *args, **kwargs): super().__init__() def get_scorer(self, task, trainset=None): - """ Given a task, gets a scorer. Trainset can be user for computing unknown and ambiguous tokens. + """ Given a task, gets a scorer. Trainset can be used for computing + unknown and ambiguous tokens. :param task: Taskname (str) :param trainset: Dataset for training """ scorer = Scorer(self.label_encoder.tasks[task]) if not self._fitted_trainset_scorer and trainset: - self.known, self.ambs = get_known_and_ambigous_tokens(trainset, list(self.label_encoder.tasks.values())) + self.known, self.ambs = get_known_and_ambigous_tokens( + trainset, list(self.label_encoder.tasks.values())) self._fitted_trainset_scorer = True scorer.set_known_and_amb(self.known, self.ambs[task]) return scorer From e140024a08641297b8775ad0f59128131e2255a1 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sat, 25 Apr 2020 12:15:59 +0200 Subject: [PATCH 32/46] Added docstring to run_optimize --- pie/optimize.py | 35 ++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/pie/optimize.py b/pie/optimize.py index 1742b6c..cbb86ea 100644 --- a/pie/optimize.py +++ b/pie/optimize.py @@ -96,14 +96,43 @@ def sample_from_config(opt): return output -def run_optimize(train_fn, config, opt, n_iter, **kwargs): +def run_optimize(train_fn, settings, opt, n_iter, **kwargs): + """ + Run random search over the given `settings`, resampling parameters as + specified by `opt` for `n_iter` iterations using the `train_fn` function. + + - train_fn: a function that takes settings and any other possible kwargs + and runs a training procedure + - settings: a Settings object fully determining a training run + - opt: a sampling file specifying parameters to resample each run, + including a distribution to sample from. The contents are read from + a json file with the following structure.
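+      Keys mirror settings fields; nested dicts and lists are resampled + recursively, while plain values are copied over unchanged. For example: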
From ddad6818073e8592b13e03d9e43594b2284b7e4e Mon Sep 17 00:00:00 2001
From: Enrique Manjavacas
Date: Sat, 25 Apr 2020 12:16:55 +0200
Subject: [PATCH 33/46] abstracted out build_embeddings

---
 pie/models/embedding.py | 46 ++++++++++++++++++++++++++++++
 pie/models/model.py | 63 +++++++++--------------------------
 2 files changed, 60 insertions(+), 49 deletions(-)

diff --git a/pie/models/embedding.py b/pie/models/embedding.py
index d47891d..535e170 100644
--- a/pie/models/embedding.py
+++ b/pie/models/embedding.py
@@ -165,6 +165,52 @@ def func(wemb, cemb):
     return func


+def build_embeddings(label_encoder, wemb_dim,
+                     cemb_dim, cemb_type, custom_cemb_cell, cemb_layers, cell, init_rnn,
+                     merge_type, dropout):
+    """
+    Utility function to build embedding layers
+    """
+    wemb = None
+    if wemb_dim > 0:
+        wemb = nn.Embedding(len(label_encoder.word), wemb_dim,
+                            padding_idx=label_encoder.word.get_pad())
+        # init embeddings
+        initialization.init_embeddings(wemb)
+
+    cemb = None
+    if cemb_type.upper() == 'RNN':
+        cemb = RNNEmbedding(len(label_encoder.char), cemb_dim,
+                            padding_idx=label_encoder.char.get_pad(),
+                            custom_lstm=custom_cemb_cell, dropout=dropout,
+                            num_layers=cemb_layers, cell=cell, init_rnn=init_rnn)
+    elif cemb_type.upper() == 'CNN':
+        cemb = CNNEmbedding(len(label_encoder.char), cemb_dim,
+                            padding_idx=label_encoder.char.get_pad())
+
+    merger = None
+    if cemb is not None and wemb is not None:
+        if merge_type.lower() == 'mixer':
+            if cemb.embedding_dim != wemb.embedding_dim:
+                raise ValueError("EmbeddingMixer needs equal embedding dims")
+            merger = EmbeddingMixer(wemb_dim)
+            in_dim = wemb_dim
+        elif merge_type == 'concat':
+            merger = EmbeddingConcat()
+            in_dim = wemb_dim + cemb.embedding_dim
+        else:
+            raise ValueError("Unknown merge method: {}".format(merge_type))
+    elif cemb is None:
+        in_dim = wemb_dim
+    else:
+        in_dim = cemb.embedding_dim
+
+    return (wemb, cemb, merger), in_dim
+
+
+def get_embeddings()
+
+
 if __name__ == '__main__':
     from pie.settings import settings_from_file
     from pie.data import Dataset

diff --git a/pie/models/model.py b/pie/models/model.py
index 6aa1f7b..6c8a54b 100644
--- a/pie/models/model.py
+++ b/pie/models/model.py
@@ -5,7 +5,7 @@

 from pie import torch_utils, initialization

-from .embedding import RNNEmbedding, CNNEmbedding, EmbeddingMixer, EmbeddingConcat
+from .embedding import build_embeddings
 from .decoder import AttentionalDecoder, LinearDecoder, CRFDecoder
 from .encoder import RNNEncoder
 from .base_model import BaseModel
@@ -73,41 +73,10 @@ def __init__(self, label_encoder, tasks, 
wemb_dim, cemb_dim, hidden_size, num_la super().__init__(label_encoder, tasks) # Embeddings - self.wemb = None - if self.wemb_dim > 0: - self.wemb = nn.Embedding(len(label_encoder.word), wemb_dim, - padding_idx=label_encoder.word.get_pad()) - # init embeddings - initialization.init_embeddings(self.wemb) - - self.cemb = None - if cemb_type.upper() == 'RNN': - self.cemb = RNNEmbedding( - len(label_encoder.char), cemb_dim, - padding_idx=label_encoder.char.get_pad(), - custom_lstm=custom_cemb_cell, dropout=dropout, - num_layers=cemb_layers, cell=cell, init_rnn=init_rnn) - elif cemb_type.upper() == 'CNN': - self.cemb = CNNEmbedding( - len(label_encoder.char), cemb_dim, - padding_idx=label_encoder.char.get_pad()) - - self.merger = None - if self.cemb is not None and self.wemb is not None: - if merge_type.lower() == 'mixer': - if self.cemb.embedding_dim != self.wemb.embedding_dim: - raise ValueError("EmbeddingMixer needs equal embedding dims") - self.merger = EmbeddingMixer(wemb_dim) - in_dim = wemb_dim - elif merge_type == 'concat': - self.merger = EmbeddingConcat() - in_dim = wemb_dim + self.cemb.embedding_dim - else: - raise ValueError("Unknown merge method: {}".format(merge_type)) - elif self.cemb is None: - in_dim = wemb_dim - else: - in_dim = self.cemb.embedding_dim + (self.wemb, self.cemb, self.merger), in_dim = build_embeddings( + label_encoder, wemb_dim, + cemb_dim, cemb_type, custom_cemb_cell, cemb_layers, cell, init_rnn, + merge_type, dropout) # Encoder self.encoder = None @@ -185,7 +154,8 @@ def __init__(self, label_encoder, tasks, wemb_dim, cemb_dim, hidden_size, num_la self.lm_bwd_decoder = LinearDecoder(label_encoder.word, hidden_size) def get_args_and_kwargs(self): - return {'args': (self.wemb_dim, self.cemb_dim, self.hidden_size, self.num_layers), + return {'args': (self.wemb_dim, self.cemb_dim, + self.hidden_size, self.num_layers), 'kwargs': {'dropout': self.dropout, 'word_dropout': self.word_dropout, 'cell': self.cell, @@ -208,6 +178,13 @@ def embedding(self, word, wlen, char, clen): # cemb_outs: (seq_len x batch x emb_dim) cemb, cemb_outs = self.cemb(char, clen, wlen) + if wemb is None: + emb = cemb + elif cemb is None: + emb = wemb + else: + emb = self.merger(wemb, cemb) + return wemb, cemb, cemb_outs def init_from_encoder(self, encoder): @@ -241,12 +218,6 @@ def loss(self, batch_data, *target_tasks): # Embedding wemb, cemb, cemb_outs = self.embedding(word, wlen, char, clen) - if wemb is None: - emb = cemb - elif cemb is None: - emb = wemb - else: - emb = self.merger(wemb, cemb) # Encoder emb = F.dropout(emb, p=self.dropout, training=self.training) @@ -313,12 +284,6 @@ def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): # Embedding wemb, cemb, cemb_outs = self.embedding(word, wlen, char, clen) - if wemb is None: - emb = cemb - elif cemb is None: - emb = wemb - else: - emb = self.merger(wemb, cemb) # Encoder enc_outs = None From 814726da04c7ebe46f7eb784ad72bb7f0f2f303e Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sun, 26 Apr 2020 18:51:56 +0200 Subject: [PATCH 34/46] fixing LSTM :-s --- pie/models/decoder.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pie/models/decoder.py b/pie/models/decoder.py index 379a4b5..55b6cb1 100644 --- a/pie/models/decoder.py +++ b/pie/models/decoder.py @@ -413,7 +413,7 @@ def predict_beam(self, enc_outs, lengths, # expose beam dim for swaping if isinstance(hidden, tuple): hidden = hidden[0].view(self.num_layers, width, batch, -1), \ - hidden[0].view(self.num_layers, width, batch, -1) + 
hidden[1].view(self.num_layers, width, batch, -1) else: hidden = hidden.view(self.num_layers, width, batch, -1) @@ -432,7 +432,11 @@ def predict_beam(self, enc_outs, lengths, hidden[:, :, i].copy_(hidden[:, :, i].index_select(1, sbeam)) # collapse beam and batch - hidden = hidden.view(self.num_layers, width * batch, -1) + if isinstance(hidden, tuple): + hidden = hidden[0].view(self.num_layers, width * batch, -1), \ + hidden[1].view(self.num_layers, width * batch, -1) + else: + hidden = hidden.view(self.num_layers, width * batch, -1) scores, hyps = [], [] for beam in beams: From b797646bcecb2a48e37e95449078bac98b43e122 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sun, 26 Apr 2020 18:52:25 +0200 Subject: [PATCH 35/46] deprecation fix --- pie/models/embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pie/models/embedding.py b/pie/models/embedding.py index 535e170..d077772 100644 --- a/pie/models/embedding.py +++ b/pie/models/embedding.py @@ -149,9 +149,9 @@ def forward(self, wembs, cembs): alpha_in = torch.cat([wembs, cembs], dim=-1) # ((seq_len x) batch) if wembs.dim() == 3: - alpha = F.sigmoid(torch.einsum('do,mbd->mb', [self.alpha, alpha_in])) + alpha = torch.sigmoid(torch.einsum('do,mbd->mb', [self.alpha, alpha_in])) else: - alpha = F.sigmoid(torch.einsum('do,bd->b', [self.alpha, alpha_in])) + alpha = torch.sigmoid(torch.einsum('do,bd->b', [self.alpha, alpha_in])) wembs = alpha.unsqueeze(-1).expand_as(wembs) * wembs cembs = (1 - alpha).unsqueeze(-1).expand_as(cembs) * cembs From e7f9be9064579507f3a0c3689342c4eba8a27efd Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sun, 26 Apr 2020 18:52:44 +0200 Subject: [PATCH 36/46] forgotten option --- pie/default_settings.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pie/default_settings.json b/pie/default_settings.json index 198bf4b..232b095 100644 --- a/pie/default_settings.json +++ b/pie/default_settings.json @@ -136,5 +136,6 @@ "linear_layers": 1, // number of layers for linear decoders "hidden_size": 300, // sentence encoding dimension "num_layers": 1, // num recurrent layers for the sentence encoder - "cell": "LSTM" // cell type for rnns + "cell": "LSTM", // cell type for rnns + "init_rnn": "default" // initializing RNNs (default, xavier_uniform, orthogonal) } From f821750693bafd00d47b98909bcee3cc3e5faaa5 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sun, 26 Apr 2020 18:53:46 +0200 Subject: [PATCH 37/46] fixes to abstracted out embeddings --- pie/models/__init__.py | 1 + pie/models/embedding.py | 9 +++------ pie/models/model.py | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pie/models/__init__.py b/pie/models/__init__.py index 2ad955c..271b934 100644 --- a/pie/models/__init__.py +++ b/pie/models/__init__.py @@ -3,6 +3,7 @@ from .model import SimpleModel from .encoder import RNNEncoder from .embedding import CNNEmbedding, RNNEmbedding, EmbeddingConcat, EmbeddingMixer +from .embedding import build_embeddings from .decoder import LinearDecoder, AttentionalDecoder, CRFDecoder from .loaders import get_pretrained_embeddings from .scorer import Scorer, compute_scores diff --git a/pie/models/embedding.py b/pie/models/embedding.py index d077772..81c4471 100644 --- a/pie/models/embedding.py +++ b/pie/models/embedding.py @@ -200,17 +200,14 @@ def build_embeddings(label_encoder, wemb_dim, in_dim = wemb_dim + cemb.embedding_dim else: raise ValueError("Unknown merge method: {}".format(merge_type)) - elif cemb is None: - in_dim = wemb_dim - else: 
+ elif cemb is not None: in_dim = cemb.embedding_dim + else: + in_dim = wemb_dim return (wemb, cemb, merger), in_dim -def get_embeddings() - - if __name__ == '__main__': from pie.settings import settings_from_file from pie.data import Dataset diff --git a/pie/models/model.py b/pie/models/model.py index 6c8a54b..1a53a30 100644 --- a/pie/models/model.py +++ b/pie/models/model.py @@ -185,7 +185,7 @@ def embedding(self, word, wlen, char, clen): else: emb = self.merger(wemb, cemb) - return wemb, cemb, cemb_outs + return emb, (wemb, cemb, cemb_outs) def init_from_encoder(self, encoder): # wemb @@ -217,7 +217,7 @@ def loss(self, batch_data, *target_tasks): output = {} # Embedding - wemb, cemb, cemb_outs = self.embedding(word, wlen, char, clen) + emb, (wemb, cemb, cemb_outs) = self.embedding(word, wlen, char, clen) # Encoder emb = F.dropout(emb, p=self.dropout, training=self.training) @@ -283,7 +283,7 @@ def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): (word, wlen), (char, clen) = inp # Embedding - wemb, cemb, cemb_outs = self.embedding(word, wlen, char, clen) + emb, (wemb, cemb, cemb_outs) = self.embedding(word, wlen, char, clen) # Encoder enc_outs = None From d3a8cdb6cd3b011413d05d40b0f98e340b479103 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Sun, 26 Apr 2020 18:54:05 +0200 Subject: [PATCH 38/46] more deprecation fixes --- pie/models/highway.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pie/models/highway.py b/pie/models/highway.py index e74db14..03ecc3a 100644 --- a/pie/models/highway.py +++ b/pie/models/highway.py @@ -31,7 +31,7 @@ def forward(self, inp): current = inp for layer in self.layers: inp, gate = layer(current).chunk(2, dim=-1) - inp, gate = getattr(F, self.act)(inp), F.sigmoid(gate) + inp, gate = getattr(F, self.act)(inp), torch.sigmoid(gate) current = gate * current + (1 - gate) * inp return current From 63d60913131f19da95a5910a376104a8446c8c25 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 28 Apr 2020 13:48:13 +0200 Subject: [PATCH 39/46] formatting --- pie/models/scorer.py | 9 ++++----- pie/optimize.py | 12 ++++++++++++ pie/scripts/evaluate.py | 4 ++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pie/models/scorer.py b/pie/models/scorer.py index f8c7eb4..5274491 100644 --- a/pie/models/scorer.py +++ b/pie/models/scorer.py @@ -358,11 +358,10 @@ def classification_report(y_true, y_pred, digits=2): p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, average=None) - tbl_rows = list(zip( - target_names, - *[map(lambda x: floatfmt.format(x), nb_list.tolist()) - for nb_list in [p, r, f1]], - *[list(map(str, s.tolist()))])) + formatted = [] + for nb_list in [p, r, f1]: + formatted.append([floatfmt.format(x) for x in nb_list.tolist()]) + support = [str(x) for x in s.tolist()] # compute averages last_row = (last_line_heading, diff --git a/pie/optimize.py b/pie/optimize.py index cbb86ea..bcf2c1d 100644 --- a/pie/optimize.py +++ b/pie/optimize.py @@ -136,3 +136,15 @@ def run_optimize(train_fn, settings, opt, n_iter, **kwargs): print("::: Sampled settings :::") print(yaml.dump(dict(merged))) train_fn(check_settings(merge_task_defaults(merged)), **kwargs) + + +if __name__ == '__main__': + from pie.settings import settings_from_file + settings = settings_from_file("./transformer-lemma.json") + opt = read_opt("opt-transformer.json") + for _ in range(10): + sampled = sample_from_config(opt) + d = Settings(utils.recursive_merge(dict(settings), sampled, overwrite=True)) + for k in opt: + print(k, 
d[k]) + print() diff --git a/pie/scripts/evaluate.py b/pie/scripts/evaluate.py index 7b3d9fb..fa6aab1 100644 --- a/pie/scripts/evaluate.py +++ b/pie/scripts/evaluate.py @@ -54,8 +54,8 @@ def run(model_path, test_path, train_path, # data testset = Dataset(settings, Reader(settings, *test_path), model.label_encoder) - for task in model.evaluate( - testset, trainset, use_beam=use_beam, beam_width=beam_width).values(): + for task in model.evaluate(testset, trainset, + use_beam=use_beam, beam_width=beam_width).values(): task.print_summary( full=full, confusion_matrix=confusion, report=report, markdown=markdown) From 7575ed6b3241bc19276224dfe9d0ac086c95ed7c Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 28 Apr 2020 14:11:57 +0200 Subject: [PATCH 40/46] added transformer.py --- pie/models/transformer.py | 388 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 388 insertions(+) create mode 100644 pie/models/transformer.py diff --git a/pie/models/transformer.py b/pie/models/transformer.py new file mode 100644 index 0000000..f82ecc4 --- /dev/null +++ b/pie/models/transformer.py @@ -0,0 +1,388 @@ + +import random +import logging +from datetime import datetime + +import numpy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers import AutoModel, AutoTokenizer + +import pie +from pie import torch_utils +from pie.settings import settings_from_file +from pie.trainer import Trainer +from pie import initialization +from pie.data import Dataset, Reader, MultiLabelEncoder +from pie.data.dataset import pack_batch +from pie.models import (BaseModel, LinearDecoder, CRFDecoder, + build_embeddings, AttentionalDecoder) +from pie import optimize + + +def get_tokenizer_breakpoints(tokenizer): + breaks = [] + for v, idx in tokenizer.get_vocab().items(): + breaks.append(int(not v.startswith('Ġ'))) + breaks = torch.tensor(breaks) + breaks[tokenizer.all_special_ids] = 0 + return breaks + + +def get_span_offsets(input_ids, breaks): + max_span_len = 0 + spans, span = [], 0 + for idx, i in enumerate(breaks[input_ids].tolist()[::-1]): + if i == 0: + start = len(input_ids) - idx - 1 + spans.append((start, start + span + 1)) + max_span_len = max(max_span_len, span + 1) + span = 0 + else: + span += 1 + spans = spans[::-1] + + return spans, max_span_len + + +def get_spans(batch, input_ids, breaks): + span_offsets, max_span_len = zip( + *[get_span_offsets(inp, breaks) for inp in input_ids]) + max_span_len = max(max_span_len) + max_spans = max(map(len, span_offsets)) + batch_size, _, emb_dim = batch.shape + output = torch.zeros( + batch_size, max_spans, max_span_len, emb_dim, device=batch.device) + mask = torch.zeros(batch_size, max_spans, max_span_len) + + for i in range(batch_size): + for span, (start, end) in enumerate(span_offsets[i]): + output[i, span, 0:end-start].copy_(batch[i, start:end]) + mask[i, span, 0:end-start] = 1 + + return output, mask.bool() + + +def check_alignment(tokenizer, text): + input_ids = tokenizer.batch_encode_plus(text)['input_ids'] + breaks = get_tokenizer_breakpoints(tokenizer) + spans, _ = get_span_offsets(input_ids[0], breaks) + tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) + for start, end in spans: + print(tokens[start:end]) + + +class TransformerDataset(Dataset): + def __init__(self, settings, reader, label_encoder, tokenizer, model): + super().__init__(settings, reader, label_encoder) + + self.tokenizer = tokenizer + self.model = model + self.breaks = get_tokenizer_breakpoints(tokenizer).to(settings.device) + + def 
get_transformer_output(self, text, device):
+        encoded = self.tokenizer.batch_encode_plus(
+            text, return_tensors='pt', pad_to_max_length=True)
+        encoded = {k: val.to(self.model.device) for k, val in encoded.items()}
+        with torch.no_grad():
+            batch, _ = self.model(**encoded)
+        input_ids = self.tokenizer.batch_encode_plus(text)['input_ids']
+        # remove <s>, </s> tokens
+        batch = batch[:, 1:-1]
+        input_ids = [inp[1:-1] for inp in input_ids]
+        # get spans
+        context, mask = get_spans(batch, input_ids, self.breaks)
+        context, mask = context.to(device), mask.to(device)
+
+        return context, mask
+
+    def pack_batch(self, batch, device=None):
+        device = device or self.device
+        (word, char), tasks = pack_batch(self.label_encoder, batch, device)
+        context, mask = self.get_transformer_output(
+            [' '.join(inp) for inp, _ in batch], device)
+
+        return (word, char, (context, mask)), tasks
+
+
+class SpanSelfAttention(nn.Module):
+    def __init__(self, context_dim, hidden_size, dropout=0.0):
+        self.context_dim = context_dim
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        super().__init__()
+
+        self.W = nn.Linear(context_dim, hidden_size)
+        self.v_a = nn.Parameter(torch.Tensor(hidden_size, 1))
+        self.init()
+
+    def init(self):
+        self.v_a.data.uniform_(-0.05, 0.05)
+        nn.init.xavier_uniform_(self.W.weight)
+
+    def forward(self, context, mask):
+        # (batch, num_spans, max_span_len, 1)
+        weights = self.W(context) @ self.v_a.unsqueeze(0).unsqueeze(0)
+        weights = weights.squeeze(3)
+        # apply mask
+        weights.masked_fill_(~mask, -float('inf'))
+        # softmax
+        weights = F.softmax(weights, dim=-1)
+        # remove nans that arise in padding
+        weights = torch.where(torch.isnan(weights), torch.zeros_like(weights), weights)
+        # weighted sum (batch, num_spans, max_span_len, dim) -> (batch, num_spans, dim)
+        context = (context * weights.unsqueeze(-1)).sum(2)
+        context = F.dropout(context, p=self.dropout, training=self.training)
+        # transpose to batch-second
+        context = context.transpose(0, 1)
+        return context
+
+
+class Model(BaseModel):
+    def __init__(self, label_encoder, tasks, context_dim,
+                 # input embeddings
+                 wemb_dim=0, cemb_dim=0, cemb_type='RNN', custom_cemb_cell=False,
+                 cemb_layers=1, cell='GRU', init_rnn='default', merge_type='concat',
+                 # decoder
+                 linear_layers=1, dropout=0.0, scorer='general'):
+        self.context_dim = context_dim
+        self.linear_layers = linear_layers
+        self.dropout = dropout
+        super().__init__(label_encoder, tasks)
+
+        hidden_size = context_dim
+
+        # embeddings
+        (self.wemb, self.cemb, self.merger), in_dim = build_embeddings(
+            label_encoder, wemb_dim,
+            cemb_dim, cemb_type, custom_cemb_cell, cemb_layers, cell, init_rnn,
+            merge_type, dropout)
+
+        # self attention
+        self.self_att = SpanSelfAttention(
+            context_dim, hidden_size, dropout=dropout)
+
+        # decoders
+        decoders = {}
+        for tname, task in self.tasks.items():
+
+            if task['level'].lower() == 'char':
+                if task['decoder'].lower() == 'attentional':
+                    decoder = AttentionalDecoder(
+                        label_encoder.tasks[tname], cemb_dim, self.cemb.embedding_dim,
+                        context_dim=hidden_size + in_dim, scorer=scorer,
+                        num_layers=cemb_layers, cell=cell, dropout=dropout,
+                        init_rnn=init_rnn)
+
+            elif task['level'].lower() == 'token':
+                # linear
+                if task['decoder'].lower() == 'linear':
+                    decoder = LinearDecoder(
+                        label_encoder.tasks[tname], hidden_size + in_dim,
+                        highway_layers=linear_layers - 1)
+                # crf
+                elif task['decoder'].lower() == 'crf':
+                    decoder = CRFDecoder(
+                        label_encoder.tasks[tname], hidden_size + in_dim,
+                        highway_layers=linear_layers - 1)
+
+            else:
+                raise 
ValueError( + "Unknown decoder type {} for token-level task: {}".format( + task['decoder'], tname)) + + self.add_module('{}_decoder'.format(tname), decoder) + decoders[tname] = decoder + + self.decoders = decoders + + def get_args_and_kwargs(self): + return {'args': (self.context_dim, ), + 'kwargs': {'linear_layers': self.linear_layers}} + + def embedding(self, word, wlen, char, clen): + wemb, cemb, cemb_outs = None, None, None + if self.wemb is not None: + # set words to unknown with prob `p` depending on word frequency + word = torch_utils.word_dropout( + word, self.word_dropout, self.training, self.label_encoder.word) + wemb = self.wemb(word) + if self.cemb is not None: + # cemb_outs: (seq_len x batch x emb_dim) + cemb, cemb_outs = self.cemb(char, clen, wlen) + + if wemb is None: + emb = cemb + elif cemb is None: + emb = wemb + elif self.merger is not None: + emb = self.merger(wemb, cemb) + else: + emb = None + + return emb, (wemb, cemb, cemb_outs) + + def loss(self, batch_data, *target_tasks): + ((word, wlen), (char, clen), (context, mask)), tasks = batch_data + output = {} + + emb, (_, _, cemb_outs) = self.embedding(word, wlen, char, clen) + + outs = self.self_att(context, mask) + if emb is not None: + outs = torch.cat([outs, emb], dim=-1) + + for task in target_tasks: + (target, length), decoder = tasks[task], self.decoders[task] + + if self.tasks[task]['level'].lower() == 'char': + cemb_outs = F.dropout( + cemb_outs, p=self.dropout, training=self.training) + logits = decoder(target, length, cemb_outs, clen, + context=torch_utils.flatten_padded_batch(outs, wlen)) + output[task] = decoder.loss(logits, target) + else: + if isinstance(decoder, LinearDecoder): + logits = decoder(outs) + output[task] = decoder.loss(logits, target) + elif isinstance(decoder, CRFDecoder): + logits = decoder(outs) + output[task] = decoder.loss(logits, target, length) + + return output + + def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): + tasks = set(self.label_encoder.tasks if not len(tasks) else tasks) + (word, wlen), (char, clen), (context, mask) = inp + + emb, (_, _, cemb_outs) = self.embedding(word, wlen, char, clen) + + outs = self.self_att(context, mask) + if emb is not None: + outs = torch.cat([outs, emb], dim=-1) + + preds = {} + for task in tasks: + decoder = self.decoders[task] + + if self.label_encoder.tasks[task].level.lower() == 'char': + if not use_beam: + hyps, _ = decoder.predict_max( + cemb_outs, clen, + context=torch_utils.flatten_padded_batch(outs, wlen)) + else: + hyps, _ = decoder.predict_beam( + cemb_outs, clen, + context=torch_utils.flatten_padded_batch(outs, wlen), + width=beam_width) + if self.label_encoder.tasks[task].preprocessor_fn is None: + hyps = [''.join(hyp) for hyp in hyps] + else: + if isinstance(decoder, LinearDecoder): + hyps, _ = decoder.predict(outs, wlen) + elif isinstance(decoder, CRFDecoder): + hyps, _ = decoder.predict(outs, wlen) + else: + raise ValueError() + + preds[task] = hyps + + return preds + + +def run(settings, transformer_path): + now = datetime.now() + seed = now.hour * 10000 + now.minute * 100 + now.second + print("Using seed:", seed) + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + reader = Reader(settings, settings.input_path) + tasks = reader.check_tasks(expected=None) + + label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks) + label_encoder.fit_reader(reader) + if settings.verbose: + print("::: Tasks :::") + print() + 
for task, le in label_encoder.tasks.items(): + print("- {:<15} target={:<6} level={:<6} vocab={:<6}" + .format(task, le.target, le.level, len(le))) + print() + + tokenizer = AutoTokenizer.from_pretrained(transformer_path) + transformer = AutoModel.from_pretrained(transformer_path) + trainset = TransformerDataset( + settings, reader, label_encoder, tokenizer, transformer) + devset = None + if settings.dev_path: + devset = TransformerDataset( + settings, Reader(settings, settings.dev_path), label_encoder, + tokenizer, transformer) + else: + logging.warning("No devset: cannot monitor/optimize training") + + model = Model(label_encoder, settings.tasks, trainset.model.config.hidden_size, + wemb_dim=settings.wemb_dim, cemb_dim=settings.cemb_dim, + cemb_type=settings.cemb_type, + custom_cemb_cell=settings.custom_cemb_cell, + cemb_layers=settings.cemb_layers, cell=settings.cell, + init_rnn=settings.init_rnn, merge_type=settings.merge_type, + linear_layers=settings.linear_layers, dropout=settings.dropout, + scorer=settings.scorer) + model.to(settings.device) + + print("::: Model :::") + print() + print(model) + print() + print("::: Model parameters :::") + print() + trainable = sum(p.nelement() for p in model.parameters() if p.requires_grad) + total = sum(p.nelement() for p in model.parameters()) + print("{}/{} trainable/total".format(trainable, total)) + print() + + + # training + print("Starting training") + trainer = Trainer(settings, model, trainset, reader.get_nsents()) + scores = None + try: + scores = trainer.train_epochs(settings.epochs, devset=devset) + except KeyboardInterrupt: + print("Stopping training") + finally: + model.eval() + + if devset is not None: + scorers = model.evaluate( + devset, trainset=trainset, use_beam=True, beam_width=10) + for task, scorer in scorers.items(): + print(task) + scorer.print_summary() + print() + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('config_path') + parser.add_argument('transformer_path') + parser.add_argument('--opt_path', help='Path to optimization file (see opt.json)') + parser.add_argument('--n_iter', type=int, default=20) + args = parser.parse_args() + + settings = settings_from_file(args.config_path) + + if args.opt_path: + opt = optimize.read_opt(args.opt_path) + optimize.run_optimize( + run, settings, opt, args.n_iter, transformer_path=args.transformer_path) + else: + run(settings, args.transformer_path) From ccb93c4d0d40fa54cf8ce63331cdfee8508bd0f2 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 28 Apr 2020 14:11:57 +0200 Subject: [PATCH 41/46] added transformer.py From 23b23e077d1176452775848119524ba38310b243 Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Wed, 29 Apr 2020 15:44:33 +0200 Subject: [PATCH 42/46] make transformers work for all tokenization types --- pie/models/transformer.py | 70 +++++++++++++++------------------------ 1 file changed, 27 insertions(+), 43 deletions(-) diff --git a/pie/models/transformer.py b/pie/models/transformer.py index f82ecc4..51c5efa 100644 --- a/pie/models/transformer.py +++ b/pie/models/transformer.py @@ -22,43 +22,29 @@ from pie import optimize -def get_tokenizer_breakpoints(tokenizer): - breaks = [] - for v, idx in tokenizer.get_vocab().items(): - breaks.append(int(not v.startswith('Ġ'))) - breaks = torch.tensor(breaks) - breaks[tokenizer.all_special_ids] = 0 - return breaks - - -def get_span_offsets(input_ids, breaks): - max_span_len = 0 - spans, span = [], 0 - for idx, i in 
enumerate(breaks[input_ids].tolist()[::-1]):
-        if i == 0:
-            start = len(input_ids) - idx - 1
-            spans.append((start, start + span + 1))
-            max_span_len = max(max_span_len, span + 1)
-            span = 0
-        else:
-            span += 1
-    spans = spans[::-1]
-
-    return spans, max_span_len
-
-
-def get_spans(batch, input_ids, breaks):
-    span_offsets, max_span_len = zip(
-        *[get_span_offsets(inp, breaks) for inp in input_ids])
-    max_span_len = max(max_span_len)
-    max_spans = max(map(len, span_offsets))
+def get_instance_spans(tokenizer, text):
+    index = []
+    tokens = []
+    for (i, token) in enumerate(text.split()):
+        index.append(len(tokens))
+        for sub_token in tokenizer.tokenize(token):
+            tokens.append(sub_token)
+    index.append(len(tokens))
+    spans = list(zip(index[:-1], index[1:]))
+    return spans
+
+
+def get_spans(tokenizer, texts, batch):
+    spans = [get_instance_spans(tokenizer, inp) for inp in texts]
+    max_span_len = max(end - start for sent in spans for start, end in sent)
+    max_spans = max(map(len, spans))
     batch_size, _, emb_dim = batch.shape
     output = torch.zeros(
         batch_size, max_spans, max_span_len, emb_dim, device=batch.device)
     mask = torch.zeros(batch_size, max_spans, max_span_len)

     for i in range(batch_size):
-        for span, (start, end) in enumerate(span_offsets[i]):
+        for span, (start, end) in enumerate(spans[i]):
             output[i, span, 0:end-start].copy_(batch[i, start:end])
             mask[i, span, 0:end-start] = 1

@@ -66,12 +52,14 @@ def get_spans(batch, input_ids, breaks):


 def check_alignment(tokenizer, text):
-    input_ids = tokenizer.batch_encode_plus(text)['input_ids']
-    breaks = get_tokenizer_breakpoints(tokenizer)
-    spans, _ = get_span_offsets(input_ids[0], breaks)
-    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
-    for start, end in spans:
-        print(tokens[start:end])
+    spans = get_instance_spans(tokenizer, text)
+    orig_tokens = text.split()
+    assert len(spans) == len(orig_tokens)
+    tokens = tokenizer.tokenize(text)
+    output = []
+    for idx, (start, end) in enumerate(spans):
+        output.append((tokens[start:end], orig_tokens[idx]))
+    return output


 class TransformerDataset(Dataset):
@@ -80,20 +68,17 @@ def __init__(self, settings, reader, label_encoder, tokenizer, model):

         self.tokenizer = tokenizer
         self.model = model
-        self.breaks = get_tokenizer_breakpoints(tokenizer).to(settings.device)

     def get_transformer_output(self, text, device):
         encoded = self.tokenizer.batch_encode_plus(
             text, return_tensors='pt', pad_to_max_length=True)
         encoded = {k: val.to(self.model.device) for k, val in encoded.items()}
         with torch.no_grad():
-            batch, _ = self.model(**encoded)
-        input_ids = self.tokenizer.batch_encode_plus(text)['input_ids']
+            batch = self.model(**encoded)[0]  # some models return 2 items, others 1
         # remove <s>, </s> tokens
         batch = batch[:, 1:-1]
-        input_ids = [inp[1:-1] for inp in input_ids]
         # get spans
-        context, mask = get_spans(batch, input_ids, self.breaks)
+        context, mask = get_spans(self.tokenizer, text, batch)
         context, mask = context.to(device), mask.to(device)

         return context, mask
@@ -348,7 +333,6 @@ def run(settings, transformer_path):
     print("{}/{} trainable/total".format(trainable, total))
     print()

-
     # training
     print("Starting training")
     trainer = Trainer(settings, model, trainset, reader.get_nsents())
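Since the alignment between whitespace tokens and transformer subtokens is easy to get subtly wrong, the rewritten check_alignment above gives a direct way to eyeball it. A minimal sketch (the checkpoint name is a placeholder; any transformers tokenizer can be substituted):

    from transformers import AutoTokenizer

    from pie.models.transformer import check_alignment

    # placeholder checkpoint; substitute the model actually used for training
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    for sub_tokens, word in check_alignment(tokenizer, "in principio erat verbum"):
        # every whitespace token should map to a contiguous run of subtokens
        print(word, "->", sub_tokens)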
From cf558bbdcfb123bd81cf66071b66c06f5366aabf Mon Sep 17 00:00:00 2001
From: Enrique Manjavacas
Date: Tue, 12 May 2020 09:41:59 +0200
Subject: [PATCH 43/46] fixed bug

---
 pie/models/transformer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pie/models/transformer.py b/pie/models/transformer.py
index 51c5efa..9efde1c 100644
--- a/pie/models/transformer.py
+++ b/pie/models/transformer.py
@@ -27,15 +27,15 @@ def get_instance_spans(tokenizer, text):
     tokens = []
     for (i, token) in enumerate(text.split()):
         index.append(len(tokens))
-        for sub_token in tokenizer.tokenize(token):
+        for sub_token in tokenizer.tokenize(token, add_prefix_space=True):
             tokens.append(sub_token)
     index.append(len(tokens))
     spans = list(zip(index[:-1], index[1:]))
     return spans


-def get_spans(tokenizer, texts, batch):
-    spans = [get_instance_spans(tokenizer, inp) for inp in texts]
+def get_spans(tokenizer, text, batch):
+    spans = [get_instance_spans(tokenizer, inp) for inp in text]
     max_span_len = max(end - start for sent in spans for start, end in sent)
     max_spans = max(map(len, spans))
     batch_size, _, emb_dim = batch.shape

From a691072d2dc56ada89ea1d473a1b6267694c32f3 Mon Sep 17 00:00:00 2001
From: Enrique Manjavacas
Date: Tue, 12 May 2020 09:42:41 +0200
Subject: [PATCH 44/46] fixed serialization issue

---
 pie/models/base_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pie/models/base_model.py b/pie/models/base_model.py
index c201d50..802997f 100644
--- a/pie/models/base_model.py
+++ b/pie/models/base_model.py
@@ -118,7 +118,7 @@ def save(self, fpath, infix=None, settings=None):

         # create dir if necessary
         dirname = os.path.dirname(fpath)
-        if not os.path.isdir(dirname):
+        if dirname and not os.path.isdir(dirname):
             os.makedirs(dirname)

         with tarfile.open(fpath, 'w') as tar:

From 66264792b61d0f16d99af0e5a578bd4c9057afb1 Mon Sep 17 00:00:00 2001
From: Enrique Manjavacas
Date: Tue, 12 May 2020 09:44:24 +0200
Subject: [PATCH 45/46] made transformer model serializable, rearrange scripts

---
 pie/models/__init__.py | 1 +
 pie/models/transformer.py | 149 +++++++++++--------------------------
 pie/scripts/train.py | 14 +---
 pie/scripts/transformer.py | 137 ++++++++++++++++++++++++++++++++++
 pie/settings.py | 13 ++++
 5 files changed, 194 insertions(+), 120 deletions(-)
 create mode 100644 pie/scripts/transformer.py

diff --git a/pie/models/__init__.py b/pie/models/__init__.py
index 271b934..1e0d223 100644
--- a/pie/models/__init__.py
+++ b/pie/models/__init__.py
@@ -1,6 +1,7 @@
 from .base_model import BaseModel

 from .model import SimpleModel
+from .transformer import TransformerDataset, TransformerModel
 from .encoder import RNNEncoder
 from .embedding import CNNEmbedding, RNNEmbedding, EmbeddingConcat, EmbeddingMixer
 from .embedding import build_embeddings
 from .decoder import LinearDecoder, AttentionalDecoder, CRFDecoder
 from .loaders import get_pretrained_embeddings
 from .scorer import Scorer, compute_scores

diff --git a/pie/models/transformer.py b/pie/models/transformer.py
index 9efde1c..80d0c3f 100644
--- a/pie/models/transformer.py
+++ b/pie/models/transformer.py
@@ -1,25 +1,17 @@

-import random
 import logging
 from datetime import datetime

-import numpy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from transformers import AutoModel, AutoTokenizer
-
-import pie
 from pie import torch_utils
-from pie.settings import settings_from_file
-from pie.trainer import Trainer
-from pie import initialization
-from pie.data import Dataset, Reader, MultiLabelEncoder
+from pie.data import Dataset
 from pie.data.dataset import pack_batch
-from pie.models import (BaseModel, LinearDecoder, CRFDecoder,
-                        build_embeddings, AttentionalDecoder)
-from pie import optimize
+
+from .base_model import BaseModel
+from .decoder import (LinearDecoder, CRFDecoder, AttentionalDecoder)
+from .embedding import build_embeddings


 def get_instance_spans(tokenizer, text):
@@ -125,7 +117,7 @@ def forward(self, context, mask):
         return context


-class Model(BaseModel):
+class 
TransformerModel(BaseModel): def __init__(self, label_encoder, tasks, context_dim, # input embeddings wemb_dim=0, cemb_dim=0, cemb_type='RNN', custom_cemb_cell=False, @@ -134,6 +126,14 @@ def __init__(self, label_encoder, tasks, context_dim, linear_layers=1, dropout=0.0, scorer='general'): self.context_dim = context_dim self.linear_layers = linear_layers + self.wemb_dim = wemb_dim + self.cemb_dim = cemb_dim + self.cemb_type = cemb_type + self.custom_cemb_cell = custom_cemb_cell + self.cemb_layers = cemb_layers + self.cell = cell + self.merge_type = merge_type + self.scorer = scorer self.dropout = dropout super().__init__(label_encoder, tasks) @@ -185,7 +185,12 @@ def __init__(self, label_encoder, tasks, context_dim, def get_args_and_kwargs(self): return {'args': (self.context_dim, ), - 'kwargs': {'linear_layers': self.linear_layers}} + 'kwargs': {'linear_layers': self.linear_layers, + "wemb_dim": self.wemb_dim, "cemb_dim": self.cemb_dim, + "cemb_type": self.cemb_type, + "custom_cemb_cell": self.custom_cemb_cell, + "cemb_layers": self.cemb_layers, "cell": self.cell, + "merge_type": self.merge_type, "scorer": self.scorer}} def embedding(self, word, wlen, char, clen): wemb, cemb, cemb_outs = None, None, None @@ -277,96 +282,26 @@ def predict(self, inp, *tasks, use_beam=False, beam_width=10, **kwargs): return preds -def run(settings, transformer_path): - now = datetime.now() - seed = now.hour * 10000 + now.minute * 100 + now.second - print("Using seed:", seed) - random.seed(seed) - numpy.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - - reader = Reader(settings, settings.input_path) - tasks = reader.check_tasks(expected=None) - - label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks) - label_encoder.fit_reader(reader) - if settings.verbose: - print("::: Tasks :::") - print() - for task, le in label_encoder.tasks.items(): - print("- {:<15} target={:<6} level={:<6} vocab={:<6}" - .format(task, le.target, le.level, len(le))) - print() - - tokenizer = AutoTokenizer.from_pretrained(transformer_path) - transformer = AutoModel.from_pretrained(transformer_path) - trainset = TransformerDataset( - settings, reader, label_encoder, tokenizer, transformer) - devset = None - if settings.dev_path: - devset = TransformerDataset( - settings, Reader(settings, settings.dev_path), label_encoder, - tokenizer, transformer) - else: - logging.warning("No devset: cannot monitor/optimize training") - - model = Model(label_encoder, settings.tasks, trainset.model.config.hidden_size, - wemb_dim=settings.wemb_dim, cemb_dim=settings.cemb_dim, - cemb_type=settings.cemb_type, - custom_cemb_cell=settings.custom_cemb_cell, - cemb_layers=settings.cemb_layers, cell=settings.cell, - init_rnn=settings.init_rnn, merge_type=settings.merge_type, - linear_layers=settings.linear_layers, dropout=settings.dropout, - scorer=settings.scorer) - model.to(settings.device) - - print("::: Model :::") - print() - print(model) - print() - print("::: Model parameters :::") - print() - trainable = sum(p.nelement() for p in model.parameters() if p.requires_grad) - total = sum(p.nelement() for p in model.parameters()) - print("{}/{} trainable/total".format(trainable, total)) - print() - - # training - print("Starting training") - trainer = Trainer(settings, model, trainset, reader.get_nsents()) - scores = None - try: - scores = trainer.train_epochs(settings.epochs, devset=devset) - except KeyboardInterrupt: - print("Stopping training") - finally: - model.eval() - - if devset 
is not None: - scorers = model.evaluate( - devset, trainset=trainset, use_beam=True, beam_width=10) - for task, scorer in scorers.items(): - print(task) - scorer.print_summary() - print() - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('config_path') - parser.add_argument('transformer_path') - parser.add_argument('--opt_path', help='Path to optimization file (see opt.json)') - parser.add_argument('--n_iter', type=int, default=20) - args = parser.parse_args() - - settings = settings_from_file(args.config_path) - - if args.opt_path: - opt = optimize.read_opt(args.opt_path) - optimize.run_optimize( - run, settings, opt, args.n_iter, transformer_path=args.transformer_path) - else: - run(settings, args.transformer_path) +# transformer_path = '../latin-data/latin-model/v4/checkpoint-110000/' +# from transformers import AutoModel, AutoTokenizer +# tokenizer = AutoTokenizer.from_pretrained(transformer_path) +# model = AutoModel.from_pretrained(transformer_path) +# from pie.settings import settings_from_file +# settings = settings_from_file('transformer-lemma.json') +# from pie.data import Reader, MultiLabelEncoder +# reader = Reader(settings, settings.input_path) +# label_encoder = MultiLabelEncoder.from_settings(settings).fit_reader(reader) +# r = reader.readsents() +# sents = [] +# for _ in range(10): +# _, (inp, tasks) = next(r) +# sents.append(inp) +# text = [' '.join(s) for s in sents] +# encoded = tokenizer.batch_encode_plus( +# text, return_tensors='pt', pad_to_max_length=True) +# encoded = {k: val.to(model.device) for k, val in encoded.items()} +# with torch.no_grad(): +# batch = model(**encoded)[0] +# # some models return 2 items, others 1 +# get_instance_spans(tokenizer, text[0]) +# get_spans(tokenizer, text, batch) diff --git a/pie/scripts/train.py b/pie/scripts/train.py index d491c75..a7306db 100644 --- a/pie/scripts/train.py +++ b/pie/scripts/train.py @@ -6,7 +6,7 @@ import logging import pie -from pie.settings import settings_from_file +from pie.settings import settings_from_file, get_targets, get_fname_infix from pie.trainer import Trainer from pie import initialization from pie.data import Dataset, Reader, MultiLabelEncoder @@ -19,18 +19,6 @@ import torch -def get_targets(settings): - return [task['name'] for task in settings.tasks if task.get('target')] - - -def get_fname_infix(settings): - # fname - fname = os.path.join(settings.modelpath, settings.modelname) - timestamp = datetime.now().strftime("%Y_%m_%d-%H_%M_%S") - infix = '+'.join(get_targets(settings)) + '-' + timestamp - return fname, infix - - def run(settings): now = datetime.now() diff --git a/pie/scripts/transformer.py b/pie/scripts/transformer.py new file mode 100644 index 0000000..4c28d9f --- /dev/null +++ b/pie/scripts/transformer.py @@ -0,0 +1,137 @@ + +import os +from datetime import datetime +import logging +import random + +import numpy +import torch +from transformers import AutoModel, AutoTokenizer + +from pie.settings import settings_from_file, get_targets, get_fname_infix +from pie.trainer import Trainer +from pie import optimize +from pie.data import Reader, MultiLabelEncoder +from pie.models import TransformerDataset, TransformerModel + + +def run(settings, transformer_path): + now = datetime.now() + seed = now.hour * 10000 + now.minute * 100 + now.second + print("Using seed:", seed) + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + reader = Reader(settings, 
settings.input_path) + tasks = reader.check_tasks(expected=None) + + label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks) + label_encoder.fit_reader(reader) + if settings.verbose: + print("::: Tasks :::") + print() + for task, le in label_encoder.tasks.items(): + print("- {:<15} target={:<6} level={:<6} vocab={:<6}" + .format(task, le.target, le.level, len(le))) + print() + + tokenizer = AutoTokenizer.from_pretrained(transformer_path) + transformer = AutoModel.from_pretrained(transformer_path) + trainset = TransformerDataset( + settings, reader, label_encoder, tokenizer, transformer) + devset = None + if settings.dev_path: + devset = TransformerDataset( + settings, Reader(settings, settings.dev_path), label_encoder, + tokenizer, transformer) + else: + logging.warning("No devset: cannot monitor/optimize training") + + model = TransformerModel( + label_encoder, settings.tasks, trainset.model.config.hidden_size, + wemb_dim=settings.wemb_dim, cemb_dim=settings.cemb_dim, + cemb_type=settings.cemb_type, + custom_cemb_cell=settings.custom_cemb_cell, + cemb_layers=settings.cemb_layers, cell=settings.cell, + init_rnn=settings.init_rnn, merge_type=settings.merge_type, + linear_layers=settings.linear_layers, dropout=settings.dropout, + scorer=settings.scorer) + model.to(settings.device) + + print("::: Model :::") + print() + print(model) + print() + print("::: Model parameters :::") + print() + trainable = sum(p.nelement() for p in model.parameters() if p.requires_grad) + total = sum(p.nelement() for p in model.parameters()) + print("{}/{} trainable/total".format(trainable, total)) + print() + + # training + print("Starting training") + trainer = Trainer(settings, model, trainset, reader.get_nsents()) + scores = None + try: + scores = trainer.train_epochs(settings.epochs, devset=devset) + except KeyboardInterrupt: + print("Stopping training") + finally: + model.eval() + + if devset is not None: + scorers = model.evaluate( + devset, trainset=trainset, use_beam=True, beam_width=10) + for task, scorer in scorers.items(): + print(task) + scorer.print_summary() + print() + + fpath, infix = get_fname_infix(settings) + if not settings.run_test: + settings['transformer_path'] = os.path.join( + os.getcwd(), transformer_path) + fpath = model.save(fpath, infix=infix, settings=settings) + print("Saved best model to: [{}]".format(fpath)) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('config_path') + parser.add_argument('transformer_path') + parser.add_argument('--opt_path', help='Path to optimization file (see opt.json)') + parser.add_argument('--n_iter', type=int, default=20) + args = parser.parse_args() + + settings = settings_from_file(args.config_path) + + if args.opt_path: + opt = optimize.read_opt(args.opt_path) + optimize.run_optimize( + run, settings, opt, args.n_iter, transformer_path=args.transformer_path) + else: + run(settings, args.transformer_path) + + +# settings = settings_from_file('../pie/transformer-lemma.json') +# reader = Reader(settings, '../pie/datasets/capitula_classic_split/train0.train.tsv') +# label_encoder = MultiLabelEncoder.from_settings(settings).fit_reader(reader) +# trans_path = '../latin-data/latin-model/v4/checkpoint-110000' +# tokenizer = AutoTokenizer.from_pretrained(trans_path) +# transformer = AutoModel.from_pretrained(trans_path) +# trainset = pie.models.TransformerDataset( +# settings, reader, label_encoder, tokenizer, transformer) +# model = pie.models.TransformerModel( +# label_encoder, 
settings.tasks, trainset.model.config.hidden_size, +# wemb_dim=settings.wemb_dim, cemb_dim=settings.cemb_dim, +# cemb_type=settings.cemb_type, +# custom_cemb_cell=settings.custom_cemb_cell, +# cemb_layers=settings.cemb_layers, cell=settings.cell, +# init_rnn=settings.init_rnn, merge_type=settings.merge_type, +# linear_layers=settings.linear_layers, dropout=settings.dropout, +# scorer=settings.scorer) diff --git a/pie/settings.py b/pie/settings.py index d2a266a..78d58a1 100644 --- a/pie/settings.py +++ b/pie/settings.py @@ -1,5 +1,6 @@ import os +from datetime import datetime import yaml import json from json_minify import json_minify @@ -148,3 +149,15 @@ def settings_from_file(config_path): print(yaml.dump(dict(settings))) return check_settings(merge_task_defaults(settings)) + + +def get_targets(settings): + return [task['name'] for task in settings.tasks if task.get('target')] + + +def get_fname_infix(settings): + # fname + fname = os.path.join(settings.modelpath, settings.modelname) + timestamp = datetime.now().strftime("%Y_%m_%d-%H_%M_%S") + infix = '+'.join(get_targets(settings)) + '-' + timestamp + return fname, infix From 17d6a03b2d07786382ded3cc28b7f804fee7041a Mon Sep 17 00:00:00 2001 From: Enrique Manjavacas Date: Tue, 12 May 2020 11:57:59 +0200 Subject: [PATCH 46/46] added evaluate option to script --- pie/scripts/transformer.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/pie/scripts/transformer.py b/pie/scripts/transformer.py index 4c28d9f..709b120 100644 --- a/pie/scripts/transformer.py +++ b/pie/scripts/transformer.py @@ -106,11 +106,42 @@ def run(settings, transformer_path): parser.add_argument('transformer_path') parser.add_argument('--opt_path', help='Path to optimization file (see opt.json)') parser.add_argument('--n_iter', type=int, default=20) + # eval arguments + parser.add_argument('--run_eval', action='store_true') + parser.add_argument('--model_path', help='only used for evaluation') + parser.add_argument('--test_path', help='only used for evaluation') + parser.add_argument('--batch_size', type=int, default=500) + parser.add_argument('--buffer_size', type=int, default=100000) + parser.add_argument('--use_beam', action='store_true') + parser.add_argument('--beam_width', type=int, default=12) + parser.add_argument('--device', default='cpu') args = parser.parse_args() settings = settings_from_file(args.config_path) - if args.opt_path: + if args.run_eval: + model = TransformerModel.load(args.model_path) + m_settings = model._settings + m_settings.device = args.device + m_settings.shuffle = False + m_settings.batch_size = args.batch_size + m_settings.buffer_size = args.buffer_size + + tokenizer = AutoTokenizer.from_pretrained(args.transformer_path) + transformer = AutoModel.from_pretrained(args.transformer_path) + trainset = TransformerDataset( + m_settings, Reader(m_settings, m_settings.input_path), model.label_encoder, + tokenizer, transformer) + testset = TransformerDataset( + m_settings, Reader(m_settings, args.test_path), model.label_encoder, + tokenizer, transformer) + + for task in model.evaluate( + testset, trainset, use_beam=args.use_beam, beam_width=args.beam_width + ).values(): + task.print_summary() + + elif args.opt_path: opt = optimize.read_opt(args.opt_path) optimize.run_optimize( run, settings, opt, args.n_iter, transformer_path=args.transformer_path)