diff --git a/.travis.yml b/.travis.yml index f1c6db54ef..3cacadfc58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,8 +5,6 @@ env: python: - "3.6" install: - - sudo apt update -y - - sudo apt install mecab libmecab-dev mecab-ipadic - python setup.py develop -q before_script: cd tests script: diff --git a/flair/data.py b/flair/data.py index 9b163a4314..6330207e01 100644 --- a/flair/data.py +++ b/flair/data.py @@ -9,9 +9,6 @@ from collections import Counter from collections import defaultdict -from tiny_tokenizer import WordTokenizer -from tiny_tokenizer import SentenceTokenizer - from segtok.segmenter import split_single from segtok.tokenizer import split_contractions from segtok.tokenizer import word_tokenizer @@ -432,8 +429,23 @@ def build_japanese_tokenizer(tokenizer: str = "MeCab"): if tokenizer.lower() != "mecab": raise NotImplementedError("Currently, MeCab is only supported.") - sentence_tokenizer = SentenceTokenizer() - word_tokenizer = WordTokenizer(tokenizer) + try: + import tiny_tokenizer + except ModuleNotFoundError: + log.warning("-" * 100) + log.warning('ATTENTION! The library "tiny_tokenizer" is not installed!') + log.warning( + 'To use Japanese tokenizer, please first install with the following steps: "pip install allennlp"' + ) + log.warning( + '- Install mecab with "sudo apt install mecab libmecab-dev mecab-ipadic"' + ) + log.warning('- Install tiny_tokenizer with "pip install tiny_tokenizer[all]"') + log.warning("-" * 100) + pass + + sentence_tokenizer = tiny_tokenizer.SentenceTokenizer() + word_tokenizer = tiny_tokenizer.WordTokenizer(tokenizer) def tokenizer(text: str) -> List[Token]: """ @@ -463,7 +475,9 @@ def tokenizer(text: str) -> List[Token]: current_offset + 1 if current_offset > 0 else current_offset ) - token = Token(text=word, start_position=start_position, whitespace_after=True) + token = Token( + text=word, start_position=start_position, whitespace_after=True + ) tokens.append(token) if (previous_token is not None) and word_offset - 1 == previous_word_offset: @@ -507,9 +521,9 @@ def segtok_tokenizer(text: str) -> List[Token]: ) if word: - token = Token(text=word, - start_position=start_position, - whitespace_after=True) + token = Token( + text=word, start_position=start_position, whitespace_after=True + ) tokens.append(token) if (previous_token is not None) and word_offset - 1 == previous_word_offset: @@ -933,7 +947,7 @@ def get_language_code(self) -> str: return self.language_code - def _restore_windows_1252_characters(self, text:str)->str: + def _restore_windows_1252_characters(self, text: str) -> str: def to_windows_1252(match): try: return bytes([ord(match.group(0))]).decode("windows-1252") diff --git a/flair/datasets.py b/flair/datasets.py index 1ab1ae1a79..4302279ddc 100644 --- a/flair/datasets.py +++ b/flair/datasets.py @@ -6,8 +6,6 @@ from pathlib import Path from typing import List, Dict, Union, Callable -import pymongo - import numpy as np import json import urllib @@ -314,24 +312,24 @@ def __init__( class FeideggerCorpus(Corpus): def __init__(self, **kwargs): - dataset = 'feidegger' + dataset = "feidegger" # cache Feidegger config file - json_link = 'https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json' - json_local_path = cached_path(json_link, Path('datasets') / dataset) + json_link = "https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json" + json_local_path = cached_path(json_link, Path("datasets") / dataset) # cache Feidegger images - dataset_info = json.load(open(json_local_path, 'r')) - images_cache_folder = os.path.join(os.path.dirname(json_local_path), 'images') + dataset_info = json.load(open(json_local_path, "r")) + images_cache_folder = os.path.join(os.path.dirname(json_local_path), "images") if not os.path.isdir(images_cache_folder): os.mkdir(images_cache_folder) for image_info in tqdm(dataset_info): - name = os.path.basename(image_info['url']) + name = os.path.basename(image_info["url"]) filename = os.path.join(images_cache_folder, name) if not os.path.isfile(filename): - urllib.request.urlretrieve(image_info['url'], filename) + urllib.request.urlretrieve(image_info["url"], filename) # replace image URL with local cached file - image_info['url'] = filename + image_info["url"] = filename feidegger_dataset: Dataset = FeideggerDataset(dataset_info, **kwargs) @@ -1184,15 +1182,14 @@ def __getitem__(self, index: int = 0) -> Sentence: class MongoDataset(FlairDataset): - def __init__( self, - query: str = None, - host: str = 'localhost', - port: int = 27017, - database: str = 'rosenberg', - collection: str = 'book', - text_field: str = 'Beskrivning', + query: str, + host: str, + port: int, + database: str, + collection: str, + text_field: str, categories_field: List[str] = None, max_tokens_per_doc: int = -1, max_chars_per_doc: int = -1, @@ -1225,6 +1222,18 @@ def __init__( :return: list of sentences """ + # first, check if pymongo is installed + try: + import pymongo + except ModuleNotFoundError: + log.warning("-" * 100) + log.warning('ATTENTION! The library "pymongo" is not installed!') + log.warning( + 'To use MongoDataset, please first install with "pip install pymongo"' + ) + log.warning("-" * 100) + pass + self.in_memory = in_memory self.tokenizer = tokenizer @@ -1245,24 +1254,20 @@ def __init__( start = 0 - kwargs = lambda start: { - 'filter': query, - 'skip': start, - 'limit': 0 - } + kwargs = lambda start: {"filter": query, "skip": start, "limit": 0} if self.in_memory: for document in self.__cursor.find(**kwargs(start)): sentence = self._parse_document_to_sentence( document[self.text], - [document[_] if _ in document else '' for _ in self.categories], - tokenizer + [document[_] if _ in document else "" for _ in self.categories], + tokenizer, ) if sentence is not None and len(sentence.tokens) > 0: self.sentences.append(sentence) self.total_sentence_count += 1 else: - self.indices = self.__cursor.find().distinct('_id') + self.indices = self.__cursor.find().distinct("_id") self.total_sentence_count = self.__cursor.count_documents() def _parse_document_to_sentence( @@ -1275,7 +1280,9 @@ def _parse_document_to_sentence( sentence = Sentence(text, labels=labels, use_tokenizer=tokenizer) if self.max_tokens_per_doc > 0: - sentence.tokens = sentence.tokens[: min(len(sentence), self.max_tokens_per_doc)] + sentence.tokens = sentence.tokens[ + : min(len(sentence), self.max_tokens_per_doc) + ] return sentence return None @@ -1290,11 +1297,11 @@ def __getitem__(self, index: int = 0) -> Sentence: if self.in_memory: return self.sentences[index] else: - document = self.__cursor.find_one({'_id': index}) + document = self.__cursor.find_one({"_id": index}) sentence = self._parse_document_to_sentence( document[self.text], - [document[_] if _ in document else '' for _ in self.categories], - self.tokenizer + [document[_] if _ in document else "" for _ in self.categories], + self.tokenizer, ) return sentence @@ -1395,16 +1402,13 @@ def __init__(self, dataset_info, in_memory: bool = True, **kwargs): preprocessor = lambda x: x.lower() for image_info in dataset_info: - image = Image(imageURL=image_info['url']) - for caption in image_info['descriptions']: + image = Image(imageURL=image_info["url"]) + for caption in image_info["descriptions"]: # append Sentence-Image data point self.data_points.append( - DataPair( - Sentence(preprocessor(caption), use_tokenizer=True), - image - ) + DataPair(Sentence(preprocessor(caption), use_tokenizer=True), image) ) - self.split.append(int(image_info['split'])) + self.split.append(int(image_info["split"])) def __len__(self): return len(self.data_points) diff --git a/flair/embeddings.py b/flair/embeddings.py index c946c89ce9..b00b5b7d9c 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -556,10 +556,7 @@ class HashEmbeddings(TokenEmbeddings): """Standard embeddings with Hashing Trick.""" def __init__( - self, - num_embeddings: int = 1000, - embedding_length: int = 300, - hash_method='md5' + self, num_embeddings: int = 1000, embedding_length: int = 300, hash_method="md5" ): super().__init__() @@ -579,7 +576,6 @@ def __init__( self.to(flair.device) - @property def num_embeddings(self) -> int: return self.__num_embeddings @@ -589,23 +585,18 @@ def embedding_length(self) -> int: return self.__embedding_length def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - def get_idx_for_item(text): hash_function = hashlib.new(self.__hash_method) - hash_function.update(bytes(str(text), 'utf-8')) + hash_function.update(bytes(str(text), "utf-8")) return int(hash_function.hexdigest(), 16) % self.__num_embeddings hash_sentences = [] for i, sentence in enumerate(sentences): - context_idxs = [ - get_idx_for_item(t.text) for t in sentence.tokens - ] + context_idxs = [get_idx_for_item(t.text) for t in sentence.tokens] hash_sentences.extend(context_idxs) - hash_sentences = torch.tensor(hash_sentences, dtype=torch.long).to( - flair.device - ) + hash_sentences = torch.tensor(hash_sentences, dtype=torch.long).to(flair.device) embedded = self.embedding_layer.forward(hash_sentences) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index afe7d81fa4..76294a5b4e 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -743,17 +743,19 @@ def _viterbi_decode( for index, (tag_id, tag_scores) in enumerate(zip(best_path, all_scores_np)): if type(tag_id) != int and tag_id.item() != tag_scores.argmax(): swap_index_score = tag_scores.argmax() - all_scores_np[index][tag_id.item()], all_scores_np[index][ - swap_index_score - ] = ( + ( + all_scores_np[index][tag_id.item()], + all_scores_np[index][swap_index_score], + ) = ( all_scores_np[index][swap_index_score], all_scores_np[index][tag_id.item()], ) elif type(tag_id) == int and tag_id != tag_scores.argmax(): swap_index_score = tag_scores.argmax() - all_scores_np[index][tag_id], all_scores_np[index][ - swap_index_score - ] = ( + ( + all_scores_np[index][tag_id], + all_scores_np[index][swap_index_score], + ) = ( all_scores_np[index][swap_index_score], all_scores_np[index][tag_id], ) diff --git a/flair/models/similarity_learning_model.py b/flair/models/similarity_learning_model.py index ee3f5a35ea..c610e13ab3 100644 --- a/flair/models/similarity_learning_model.py +++ b/flair/models/similarity_learning_model.py @@ -168,7 +168,7 @@ def __init__( target_mapping: torch.nn.Module = None, recall_at_points: List[int] = [1, 5, 10, 20], recall_at_points_weights: List[float] = [0.4, 0.3, 0.2, 0.1], - interleave_embedding_updates: bool = False + interleave_embedding_updates: bool = False, ): super(SimilarityLearner, self).__init__() self.source_embeddings: Embeddings = source_embeddings diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 330335f9e2..cc2676056e 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -83,8 +83,8 @@ def train( sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", - eval_on_train_fraction = 0., - eval_on_train_shuffle = False, + eval_on_train_fraction=0.0, + eval_on_train_shuffle=False, **kwargs, ) -> dict: """ @@ -181,15 +181,24 @@ def train( else False ) log_dev = True if not train_with_dev else False - log_train_part = True if (eval_on_train_fraction == 'dev' or eval_on_train_fraction > 0.) else False + log_train_part = ( + True + if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) + else False + ) if log_train_part: - train_part_size = len(self.corpus.dev) if eval_on_train_fraction == 'dev' \ - else int(len(self.corpus.train) * eval_on_train_fraction) - assert(train_part_size > 0) + train_part_size = ( + len(self.corpus.dev) + if eval_on_train_fraction == "dev" + else int(len(self.corpus.train) * eval_on_train_fraction) + ) + assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) - train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) + train_part = torch.utils.data.dataset.Subset( + self.corpus.train, train_part_indices + ) # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") @@ -248,7 +257,9 @@ def train( train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] - train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) + train_part = torch.utils.data.dataset.Subset( + self.corpus.train, train_part_indices + ) # get new learning rate for group in optimizer.param_groups: @@ -384,11 +395,13 @@ def train( DataLoader( train_part, batch_size=mini_batch_chunk_size, - num_workers=num_workers + num_workers=num_workers, ), - embedding_storage_mode=embeddings_storage_mode + embedding_storage_mode=embeddings_storage_mode, + ) + result_line += ( + f"\t{train_part_loss}\t{train_part_eval_result.log_line}" ) - result_line += f"\t{train_part_loss}\t{train_part_eval_result.log_line}" log.info( f"TRAIN_SPLIT : loss {train_part_loss} - score {train_part_eval_result.main_score}" ) @@ -483,7 +496,9 @@ def train( if log_train_part: f.write( "\tTRAIN_PART_LOSS\tTRAIN_PART_" - + "\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t")) + + "\tTRAIN_PART_".join( + train_part_eval_result.log_header.split("\t") + ) ) if log_dev: f.write( diff --git a/flair/visual/training_curves.py b/flair/visual/training_curves.py index 4abe34736c..f40a1749b7 100644 --- a/flair/visual/training_curves.py +++ b/flair/visual/training_curves.py @@ -9,16 +9,10 @@ import matplotlib import math -# to enable %matplotlib inline if running in ipynb -from IPython import get_ipython -ipy = get_ipython() -if ipy is not None: - ipy.run_line_magic("matplotlib", "inline") - -# change from Agg to TkAgg for interative mode +# change from Agg to TkAgg for interactive mode try: - # change from Agg to TkAgg for interative mode + # change from Agg to TkAgg for interactive mode matplotlib.use("TkAgg") except: pass diff --git a/requirements.txt b/requirements.txt index 294a10c3c3..934f096774 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +python-dateutil==2.8.0 torch>=1.1.0 gensim>=3.4.0 pytest>=3.6.4 @@ -15,8 +16,4 @@ regex tabulate urllib3<1.25,>=1.20 langdetect -torchvision -ipython==7.6.1 -ipython-genutils==0.2.0 -tiny_tokenizer[all] -pymongo \ No newline at end of file +torchvision \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py index 0b9d2fddfd..dbda5e6be6 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -12,7 +12,6 @@ Corpus, Span, segtok_tokenizer, - build_japanese_tokenizer, ) @@ -46,15 +45,15 @@ def test_create_sentence_without_tokenizer(): assert "Berlin." == sentence.tokens[2].text -def test_create_sentence_using_japanese_tokenizer(): - sentence: Sentence = Sentence("私はベルリンが好き", use_tokenizer=build_japanese_tokenizer()) - - assert 5 == len(sentence.tokens) - assert "私" == sentence.tokens[0].text - assert "は" == sentence.tokens[1].text - assert "ベルリン" == sentence.tokens[2].text - assert "が" == sentence.tokens[3].text - assert "好き" == sentence.tokens[4].text +# def test_create_sentence_using_japanese_tokenizer(): +# sentence: Sentence = Sentence("私はベルリンが好き", use_tokenizer=build_japanese_tokenizer()) +# +# assert 5 == len(sentence.tokens) +# assert "私" == sentence.tokens[0].text +# assert "は" == sentence.tokens[1].text +# assert "ベルリン" == sentence.tokens[2].text +# assert "が" == sentence.tokens[3].text +# assert "好き" == sentence.tokens[4].text def test_token_indices(): diff --git a/tests/test_transformer_embeddings.py b/tests/test_transformer_embeddings.py index 81ebe15901..73264e0b65 100644 --- a/tests/test_transformer_embeddings.py +++ b/tests/test_transformer_embeddings.py @@ -31,7 +31,7 @@ def calculate_mean_embedding( - subword_embeddings: List[torch.FloatTensor] + subword_embeddings: List[torch.FloatTensor], ) -> torch.FloatTensor: all_embeddings: List[torch.FloatTensor] = [ embedding.unsqueeze(0) for embedding in subword_embeddings