
Commit

Merge pull request #1296 from zalandoresearch/GH-1245-remove-dependencies

GH-1245: slim down dependency tree
Alan Akbik authored Nov 27, 2019
2 parents 671f274 + 8e3b09b commit fa3be40
Showing 11 changed files with 118 additions and 104 deletions.
2 changes: 0 additions & 2 deletions .travis.yml
@@ -5,8 +5,6 @@ env:
python:
- "3.6"
install:
- sudo apt update -y
- sudo apt install mecab libmecab-dev mecab-ipadic
- python setup.py develop -q
before_script: cd tests
script:
34 changes: 24 additions & 10 deletions flair/data.py
@@ -9,9 +9,6 @@
from collections import Counter
from collections import defaultdict

from tiny_tokenizer import WordTokenizer
from tiny_tokenizer import SentenceTokenizer

from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions
from segtok.tokenizer import word_tokenizer
@@ -432,8 +429,23 @@ def build_japanese_tokenizer(tokenizer: str = "MeCab"):
if tokenizer.lower() != "mecab":
raise NotImplementedError("Currently, MeCab is only supported.")

sentence_tokenizer = SentenceTokenizer()
word_tokenizer = WordTokenizer(tokenizer)
try:
import tiny_tokenizer
except ModuleNotFoundError:
log.warning("-" * 100)
log.warning('ATTENTION! The library "tiny_tokenizer" is not installed!')
log.warning(
'To use Japanese tokenizer, please first install with the following steps:'
)
log.warning(
'- Install mecab with "sudo apt install mecab libmecab-dev mecab-ipadic"'
)
log.warning('- Install tiny_tokenizer with "pip install tiny_tokenizer[all]"')
log.warning("-" * 100)
pass

sentence_tokenizer = tiny_tokenizer.SentenceTokenizer()
word_tokenizer = tiny_tokenizer.WordTokenizer(tokenizer)

def tokenizer(text: str) -> List[Token]:
"""
@@ -463,7 +475,9 @@ def tokenizer(text: str) -> List[Token]:
current_offset + 1 if current_offset > 0 else current_offset
)

token = Token(text=word, start_position=start_position, whitespace_after=True)
token = Token(
text=word, start_position=start_position, whitespace_after=True
)
tokens.append(token)

if (previous_token is not None) and word_offset - 1 == previous_word_offset:
@@ -507,9 +521,9 @@ def segtok_tokenizer(text: str) -> List[Token]:
)

if word:
token = Token(text=word,
start_position=start_position,
whitespace_after=True)
token = Token(
text=word, start_position=start_position, whitespace_after=True
)
tokens.append(token)

if (previous_token is not None) and word_offset - 1 == previous_word_offset:
@@ -933,7 +947,7 @@ def get_language_code(self) -> str:

return self.language_code

def _restore_windows_1252_characters(self, text:str)->str:
def _restore_windows_1252_characters(self, text: str) -> str:
def to_windows_1252(match):
try:
return bytes([ord(match.group(0))]).decode("windows-1252")
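The change above replaces the module-level `tiny_tokenizer` import with a lazy import inside `build_japanese_tokenizer`, so the package is only required when the Japanese tokenizer is actually used. Below is a minimal, self-contained sketch of that optional-import idiom; the helper name `optional_import` and its messages are illustrative and not part of the flair API.

```python
import importlib
import logging

log = logging.getLogger("flair")


def optional_import(module_name: str, install_hint: str):
    """Import an optional dependency on demand; warn with install instructions if missing."""
    try:
        return importlib.import_module(module_name)
    except ModuleNotFoundError:
        log.warning("-" * 100)
        log.warning(f'ATTENTION! The library "{module_name}" is not installed!')
        log.warning(f"To use this feature, please install it first: {install_hint}")
        log.warning("-" * 100)
        return None


# The heavy dependency is only paid for when the feature is requested.
tiny_tokenizer = optional_import("tiny_tokenizer", "pip install tiny_tokenizer[all]")
if tiny_tokenizer is not None:
    sentence_tokenizer = tiny_tokenizer.SentenceTokenizer()
```

Compared to a hard import at module load, the trade-off is that a missing package only surfaces at call time rather than at install time.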
76 changes: 40 additions & 36 deletions flair/datasets.py
@@ -6,8 +6,6 @@
from pathlib import Path
from typing import List, Dict, Union, Callable

import pymongo

import numpy as np
import json
import urllib
@@ -314,24 +312,24 @@ def __init__(

class FeideggerCorpus(Corpus):
def __init__(self, **kwargs):
dataset = 'feidegger'
dataset = "feidegger"

# cache Feidegger config file
json_link = 'https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json'
json_local_path = cached_path(json_link, Path('datasets') / dataset)
json_link = "https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json"
json_local_path = cached_path(json_link, Path("datasets") / dataset)

# cache Feidegger images
dataset_info = json.load(open(json_local_path, 'r'))
images_cache_folder = os.path.join(os.path.dirname(json_local_path), 'images')
dataset_info = json.load(open(json_local_path, "r"))
images_cache_folder = os.path.join(os.path.dirname(json_local_path), "images")
if not os.path.isdir(images_cache_folder):
os.mkdir(images_cache_folder)
for image_info in tqdm(dataset_info):
name = os.path.basename(image_info['url'])
name = os.path.basename(image_info["url"])
filename = os.path.join(images_cache_folder, name)
if not os.path.isfile(filename):
urllib.request.urlretrieve(image_info['url'], filename)
urllib.request.urlretrieve(image_info["url"], filename)
# replace image URL with local cached file
image_info['url'] = filename
image_info["url"] = filename

feidegger_dataset: Dataset = FeideggerDataset(dataset_info, **kwargs)

@@ -1184,15 +1182,14 @@ def __getitem__(self, index: int = 0) -> Sentence:


class MongoDataset(FlairDataset):

def __init__(
self,
query: str = None,
host: str = 'localhost',
port: int = 27017,
database: str = 'rosenberg',
collection: str = 'book',
text_field: str = 'Beskrivning',
query: str,
host: str,
port: int,
database: str,
collection: str,
text_field: str,
categories_field: List[str] = None,
max_tokens_per_doc: int = -1,
max_chars_per_doc: int = -1,
@@ -1225,6 +1222,18 @@ def __init__(
:return: list of sentences
"""

# first, check if pymongo is installed
try:
import pymongo
except ModuleNotFoundError:
log.warning("-" * 100)
log.warning('ATTENTION! The library "pymongo" is not installed!')
log.warning(
'To use MongoDataset, please first install with "pip install pymongo"'
)
log.warning("-" * 100)
pass

self.in_memory = in_memory
self.tokenizer = tokenizer

@@ -1245,24 +1254,20 @@ def __init__(

start = 0

kwargs = lambda start: {
'filter': query,
'skip': start,
'limit': 0
}
kwargs = lambda start: {"filter": query, "skip": start, "limit": 0}

if self.in_memory:
for document in self.__cursor.find(**kwargs(start)):
sentence = self._parse_document_to_sentence(
document[self.text],
[document[_] if _ in document else '' for _ in self.categories],
tokenizer
[document[_] if _ in document else "" for _ in self.categories],
tokenizer,
)
if sentence is not None and len(sentence.tokens) > 0:
self.sentences.append(sentence)
self.total_sentence_count += 1
else:
self.indices = self.__cursor.find().distinct('_id')
self.indices = self.__cursor.find().distinct("_id")
self.total_sentence_count = self.__cursor.count_documents()

def _parse_document_to_sentence(
@@ -1275,7 +1280,9 @@ def _parse_document_to_sentence(
sentence = Sentence(text, labels=labels, use_tokenizer=tokenizer)

if self.max_tokens_per_doc > 0:
sentence.tokens = sentence.tokens[: min(len(sentence), self.max_tokens_per_doc)]
sentence.tokens = sentence.tokens[
: min(len(sentence), self.max_tokens_per_doc)
]

return sentence
return None
@@ -1290,11 +1297,11 @@ def __getitem__(self, index: int = 0) -> Sentence:
if self.in_memory:
return self.sentences[index]
else:
document = self.__cursor.find_one({'_id': index})
document = self.__cursor.find_one({"_id": index})
sentence = self._parse_document_to_sentence(
document[self.text],
[document[_] if _ in document else '' for _ in self.categories],
self.tokenizer
[document[_] if _ in document else "" for _ in self.categories],
self.tokenizer,
)
return sentence

@@ -1395,16 +1402,13 @@ def __init__(self, dataset_info, in_memory: bool = True, **kwargs):
preprocessor = lambda x: x.lower()

for image_info in dataset_info:
image = Image(imageURL=image_info['url'])
for caption in image_info['descriptions']:
image = Image(imageURL=image_info["url"])
for caption in image_info["descriptions"]:
# append Sentence-Image data point
self.data_points.append(
DataPair(
Sentence(preprocessor(caption), use_tokenizer=True),
image
)
DataPair(Sentence(preprocessor(caption), use_tokenizer=True), image)
)
self.split.append(int(image_info['split']))
self.split.append(int(image_info["split"]))

def __len__(self):
return len(self.data_points)
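With this change `MongoDataset` loses its hard-coded defaults (`localhost`, `rosenberg`, `book`, …) and imports `pymongo` only inside `__init__`. A hedged usage sketch follows; the database, collection, and field names are invented for illustration, and although the parameter is annotated `query: str`, the value is forwarded to `collection.find(filter=...)` as the hunk above shows.

```python
from flair.datasets import MongoDataset

# Requires a running MongoDB instance and "pip install pymongo";
# pymongo is only imported when the dataset is constructed.
dataset = MongoDataset(
    query={},                    # Mongo filter, passed through to find(filter=...)
    host="localhost",
    port=27017,
    database="my_database",      # illustrative names, no longer shipped defaults
    collection="my_collection",
    text_field="text",
    categories_field=["category"],
    in_memory=True,
)

print(len(dataset))
print(dataset[0])                # a flair Sentence built from text_field
```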
17 changes: 4 additions & 13 deletions flair/embeddings.py
@@ -556,10 +556,7 @@ class HashEmbeddings(TokenEmbeddings):
"""Standard embeddings with Hashing Trick."""

def __init__(
self,
num_embeddings: int = 1000,
embedding_length: int = 300,
hash_method='md5'
self, num_embeddings: int = 1000, embedding_length: int = 300, hash_method="md5"
):

super().__init__()
@@ -579,7 +576,6 @@ def __init__(

self.to(flair.device)


@property
def num_embeddings(self) -> int:
return self.__num_embeddings
@@ -589,23 +585,18 @@ def embedding_length(self) -> int:
return self.__embedding_length

def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

def get_idx_for_item(text):
hash_function = hashlib.new(self.__hash_method)
hash_function.update(bytes(str(text), 'utf-8'))
hash_function.update(bytes(str(text), "utf-8"))
return int(hash_function.hexdigest(), 16) % self.__num_embeddings

hash_sentences = []
for i, sentence in enumerate(sentences):
context_idxs = [
get_idx_for_item(t.text) for t in sentence.tokens
]
context_idxs = [get_idx_for_item(t.text) for t in sentence.tokens]

hash_sentences.extend(context_idxs)

hash_sentences = torch.tensor(hash_sentences, dtype=torch.long).to(
flair.device
)
hash_sentences = torch.tensor(hash_sentences, dtype=torch.long).to(flair.device)

embedded = self.embedding_layer.forward(hash_sentences)

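The `get_idx_for_item` helper above is the whole hashing trick: instead of building a vocabulary, each token is mapped to one of `num_embeddings` rows by hashing its text. A standalone sketch of that mapping is below, mirroring the defaults in the class (1000 rows, 300 dimensions, md5); the token list is a toy example.

```python
import hashlib

import torch

num_embeddings = 1000          # number of rows in the embedding table
embedding_length = 300
embedding_layer = torch.nn.Embedding(num_embeddings, embedding_length)


def get_idx_for_item(text: str, hash_method: str = "md5") -> int:
    # Hash the token text and fold the digest into the table size,
    # so no vocabulary has to be built or stored.
    h = hashlib.new(hash_method)
    h.update(bytes(str(text), "utf-8"))
    return int(h.hexdigest(), 16) % num_embeddings


tokens = ["Berlin", "is", "nice", "."]
indices = torch.tensor([get_idx_for_item(t) for t in tokens], dtype=torch.long)
embedded = embedding_layer(indices)   # shape: (4, 300); hash collisions are accepted by design
```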
14 changes: 8 additions & 6 deletions flair/models/sequence_tagger_model.py
@@ -743,17 +743,19 @@ def _viterbi_decode(
for index, (tag_id, tag_scores) in enumerate(zip(best_path, all_scores_np)):
if type(tag_id) != int and tag_id.item() != tag_scores.argmax():
swap_index_score = tag_scores.argmax()
all_scores_np[index][tag_id.item()], all_scores_np[index][
swap_index_score
] = (
(
all_scores_np[index][tag_id.item()],
all_scores_np[index][swap_index_score],
) = (
all_scores_np[index][swap_index_score],
all_scores_np[index][tag_id.item()],
)
elif type(tag_id) == int and tag_id != tag_scores.argmax():
swap_index_score = tag_scores.argmax()
all_scores_np[index][tag_id], all_scores_np[index][
swap_index_score
] = (
(
all_scores_np[index][tag_id],
all_scores_np[index][swap_index_score],
) = (
all_scores_np[index][swap_index_score],
all_scores_np[index][tag_id],
)
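The reformatted block is a plain tuple swap: when the Viterbi path picks a tag other than the per-position argmax, the two confidence scores are exchanged so the score reported for the predicted tag is the larger one. A toy numpy illustration of that swap (the scores are made up):

```python
import numpy as np

scores = np.array([0.1, 0.7, 0.2])   # per-tag scores at one position
viterbi_tag = 2                      # tag chosen by the Viterbi path
best = scores.argmax()               # 1, disagrees with the Viterbi choice

if viterbi_tag != best:
    # Swap so the score reported for the predicted tag is the larger one
    scores[viterbi_tag], scores[best] = scores[best], scores[viterbi_tag]

print(scores)  # [0.1 0.2 0.7]
```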
2 changes: 1 addition & 1 deletion flair/models/similarity_learning_model.py
@@ -168,7 +168,7 @@ def __init__(
target_mapping: torch.nn.Module = None,
recall_at_points: List[int] = [1, 5, 10, 20],
recall_at_points_weights: List[float] = [0.4, 0.3, 0.2, 0.1],
interleave_embedding_updates: bool = False
interleave_embedding_updates: bool = False,
):
super(SimilarityLearner, self).__init__()
self.source_embeddings: Embeddings = source_embeddings
39 changes: 27 additions & 12 deletions flair/trainers/trainer.py
@@ -83,8 +83,8 @@ def train(
sampler=None,
use_amp: bool = False,
amp_opt_level: str = "O1",
eval_on_train_fraction = 0.,
eval_on_train_shuffle = False,
eval_on_train_fraction=0.0,
eval_on_train_shuffle=False,
**kwargs,
) -> dict:
"""
@@ -181,15 +181,24 @@ def train(
else False
)
log_dev = True if not train_with_dev else False
log_train_part = True if (eval_on_train_fraction == 'dev' or eval_on_train_fraction > 0.) else False
log_train_part = (
True
if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0)
else False
)

if log_train_part:
train_part_size = len(self.corpus.dev) if eval_on_train_fraction == 'dev' \
else int(len(self.corpus.train) * eval_on_train_fraction)
assert(train_part_size > 0)
train_part_size = (
len(self.corpus.dev)
if eval_on_train_fraction == "dev"
else int(len(self.corpus.train) * eval_on_train_fraction)
)
assert train_part_size > 0
if not eval_on_train_shuffle:
train_part_indices = list(range(train_part_size))
train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices)
train_part = torch.utils.data.dataset.Subset(
self.corpus.train, train_part_indices
)

# prepare loss logging file and set up header
loss_txt = init_output_file(base_path, "loss.tsv")
@@ -248,7 +257,9 @@ def train(
train_part_indices = list(range(len(self.corpus.train)))
random.shuffle(train_part_indices)
train_part_indices = train_part_indices[:train_part_size]
train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices)
train_part = torch.utils.data.dataset.Subset(
self.corpus.train, train_part_indices
)

# get new learning rate
for group in optimizer.param_groups:
@@ -384,11 +395,13 @@ def train(
DataLoader(
train_part,
batch_size=mini_batch_chunk_size,
num_workers=num_workers
num_workers=num_workers,
),
embedding_storage_mode=embeddings_storage_mode
embedding_storage_mode=embeddings_storage_mode,
)
result_line += (
f"\t{train_part_loss}\t{train_part_eval_result.log_line}"
)
result_line += f"\t{train_part_loss}\t{train_part_eval_result.log_line}"
log.info(
f"TRAIN_SPLIT : loss {train_part_loss} - score {train_part_eval_result.main_score}"
)
@@ -483,7 +496,9 @@ def train(
if log_train_part:
f.write(
"\tTRAIN_PART_LOSS\tTRAIN_PART_"
+ "\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t"))
+ "\tTRAIN_PART_".join(
train_part_eval_result.log_header.split("\t")
)
)
if log_dev:
f.write(
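The new `eval_on_train_fraction` / `eval_on_train_shuffle` options evaluate on a slice of the training data each epoch: the slice is the size of the dev split when the value is "dev", otherwise a fraction of the training set, and it can be redrawn every epoch. A condensed sketch of how that slice is selected, with toy datasets standing in for `self.corpus.train` and `self.corpus.dev`:

```python
import random

import torch

train = torch.utils.data.TensorDataset(torch.arange(100))   # stands in for corpus.train
dev = torch.utils.data.TensorDataset(torch.arange(10))      # stands in for corpus.dev

eval_on_train_fraction = 0.1   # or the string "dev"
eval_on_train_shuffle = True

log_train_part = eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0

if log_train_part:
    train_part_size = (
        len(dev)
        if eval_on_train_fraction == "dev"
        else int(len(train) * eval_on_train_fraction)
    )
    assert train_part_size > 0

    indices = list(range(len(train)))
    if eval_on_train_shuffle:
        # in the trainer this reshuffle happens at the start of every epoch
        random.shuffle(indices)
    train_part = torch.utils.data.Subset(train, indices[:train_part_size])
    print(len(train_part))  # 10
```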
(diffs for the remaining 4 changed files were not loaded)
