This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Get a Vocabulary object from the reader #4034

Closed
wants to merge 2 commits
33 changes: 25 additions & 8 deletions allennlp/commands/train.py
@@ -555,12 +555,12 @@ def from_partial_objects(
model: Lazy[Model],
data_loader: Lazy[DataLoader],
trainer: Lazy[Trainer],
vocabulary: Lazy[Vocabulary] = None,
datasets_for_vocab_creation: List[str] = None,
validation_dataset_reader: DatasetReader = None,
validation_data_path: str = None,
validation_data_loader: Lazy[DataLoader] = None,
test_data_path: str = None,
vocabulary: Optional[Lazy[Vocabulary]] = None,
datasets_for_vocab_creation: Optional[List[str]] = None,
validation_dataset_reader: Optional[DatasetReader] = None,
validation_data_path: Optional[str] = None,
validation_data_loader: Optional[Lazy[DataLoader]] = None,
test_data_path: Optional[str] = None,
evaluate_on_test: bool = False,
) -> "TrainModel":
"""
@@ -636,21 +636,38 @@ def from_partial_objects(
test_data_path=test_data_path,
)

if datasets_for_vocab_creation:
if datasets_for_vocab_creation is None:
datasets_for_vocab_creation = datasets.keys()
else:
for key in datasets_for_vocab_creation:
if key not in datasets:
raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}")

key_to_dataset_reader = {
"train": dataset_reader,
"test": validation_dataset_reader or dataset_reader,
"validation": validation_dataset_reader or dataset_reader,
}
vocabulary_from_readers = None
for key in datasets_for_vocab_creation:
reader_vocab = key_to_dataset_reader[key].get_vocabulary()
if vocabulary_from_readers is None:
vocabulary_from_readers = reader_vocab
else:
vocabulary_from_readers.extend_from_vocab(reader_vocab)

instance_generator = (
instance
for key, dataset in datasets.items()
if not datasets_for_vocab_creation or key in datasets_for_vocab_creation
if key in datasets_for_vocab_creation
for instance in dataset
)

vocabulary_ = vocabulary.construct(instances=instance_generator)
if not vocabulary_:
vocabulary_ = Vocabulary.from_instances(instance_generator)
if vocabulary_from_readers is not None:
vocabulary_.extend_from_vocab(vocabulary_from_readers)
model_ = model.construct(vocab=vocabulary_)

# Initializing the model can have side effect of expanding the vocabulary.
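For orientation outside the diff view, here is a minimal sketch of the vocabulary-construction flow this hunk introduces (the build_vocabulary helper and the readers mapping are illustrative stand-ins, not part of the PR): reader-provided vocabularies for the selected datasets are merged first, a vocabulary is still built from the instances, and the merged reader vocabulary then extends it.

from typing import Dict, Iterable, List, Optional

from allennlp.data import Instance, Vocabulary
from allennlp.data.dataset_readers import DatasetReader


def build_vocabulary(
    datasets: Dict[str, Iterable[Instance]],
    readers: Dict[str, DatasetReader],
    datasets_for_vocab_creation: Optional[List[str]] = None,
) -> Vocabulary:
    # Default to every loaded dataset, mirroring the new behavior in train.py.
    if datasets_for_vocab_creation is None:
        datasets_for_vocab_creation = list(datasets.keys())

    # Merge whatever vocabularies the readers can provide up front.
    vocab_from_readers: Optional[Vocabulary] = None
    for key in datasets_for_vocab_creation:
        reader_vocab = readers[key].get_vocabulary()
        if reader_vocab is None:
            continue
        if vocab_from_readers is None:
            vocab_from_readers = reader_vocab
        else:
            vocab_from_readers.extend_from_vocab(reader_vocab)

    # A vocabulary is still computed from the instances themselves.
    instances = (
        instance
        for key, dataset in datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    )
    vocab = Vocabulary.from_instances(instances)

    # Reader-provided entries extend whatever was discovered from instances.
    if vocab_from_readers is not None:
        vocab.extend_from_vocab(vocab_from_readers)
    return vocab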
6 changes: 6 additions & 0 deletions allennlp/data/dataset_readers/dataset_reader.py
@@ -229,6 +229,12 @@ def _read(self, file_path: str) -> Iterable[Instance]:
"""
raise NotImplementedError

def get_vocabulary(self) -> Optional[Vocabulary]:
"""Returns the vocabulary used in the created instances. By default, this
returns `None`, which causes the vocabulary to be automatically discovered
matt-gardner (Contributor) commented on Apr 7, 2020:

This isn't actually the behavior of the current code in train.py, but we can assume we'll fix that to match this description. What I would like to wrap my head around is what happens in the following cases:

  1. A glove vocabulary
  2. A BERT QA model (no tag vocab needed)
  3. A glove tagging model
  4. A BERT tagging model

What do you imagine a person specifying, and where, and how does vocabulary creation happen? Putting this method here is less scary than I originally thought, but I'm still somewhat skeptical that we really gain what you're hoping for. Outlining what code / parameters a person has to specify in the above cases would help clarify that.

Member Author replied:

1:

def get_vocabulary(self):
    # if we write a utility function
    return Vocabulary.from_glove(d=100)
    
    # if we don't write a utility function
    vocabulary = Vocabulary.empty()
    with open(cached_path("http://.../vocab.txt")) as f:
        vocabulary.add_tokens_to_namespace([t.strip() for t in f], "tokens")
    return vocabulary

    # or we could write a utility function for loading from txt files
    return Vocabulary.from_txt("http://.../vocab.txt")

This is assuming that Glove ships with a vocab.txt where each line contains a vocab item, and the order matches the order of the vectors.

2:

def get_vocabulary(self):
    return Vocabulary.from_transformers(self.transformer_model)

3:

def get_vocabulary(self):
    vocabulary = Vocabulary.from_glove(d=100)
    vocabulary.add_tokens_to_namespace(self.possible_tags, "tags")
    return vocabulary

4:

def get_vocabulary(self):
    vocabulary = Vocabulary.from_transformers(self.transformer_model)
    vocabulary.add_tokens_to_namespace(self.possible_tags, "tags")
    return vocabulary

How does the current mechanism make sure that in the case of Glove, we get the right indices for the right tokens?

Contributor replied:

For case 1, we typically don't load the whole glove vocabulary, because that's massive. We just load the embeddings for the tokens that we've seen, saving a ton of space. You can't do that without looking through your instances and computing a vocabulary.

For cases 3 and 4, if you're saying that the reader has to hard-code a list of tags / labels, this seems problematic. It means that I can't have a generic "classification csv file" dataset reader, because the reader would have to know the label set.

before training."""
return None

def _instances_from_cache_file(self, cache_filename: str) -> Iterable[Instance]:
with open(cache_filename, "r") as cache_file:
for line in cache_file:
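To ground the GloVe point in the thread above: the instance-based flow lets you keep only the pretrained vectors for tokens that actually occur in the data. Below is a rough sketch of that pruning step, assuming the standard GloVe text format (a token followed by space-separated floats) and the default "tokens" namespace; prune_glove_to_vocab is an illustrative helper, not an AllenNLP API.

from typing import Dict

import numpy
from allennlp.data import Vocabulary


def prune_glove_to_vocab(
    glove_path: str, vocab: Vocabulary, dim: int = 100
) -> Dict[str, numpy.ndarray]:
    # Keep only the vectors for tokens present in the data-derived vocabulary.
    wanted = set(vocab.get_token_to_index_vocabulary("tokens"))
    vectors: Dict[str, numpy.ndarray] = {}
    with open(glove_path, encoding="utf-8") as glove_file:
        for line in glove_file:
            token, *values = line.rstrip().split(" ")
            if token in wanted and len(values) == dim:
                vectors[token] = numpy.asarray(values, dtype="float32")
    return vectors

The full GloVe vocabulary is never materialized, which is the space saving described above, but it only works once a vocabulary has been computed from the instances.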
16 changes: 16 additions & 0 deletions allennlp/data/vocabulary.py
@@ -376,6 +376,14 @@ def from_files_and_instances(
)
return vocab

@classmethod
def from_transformers(cls, model_name: str, namespace: str = "tokens"):
vocab = cls.empty()
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
vocab.extend_from_dictionary(tokenizer.get_vocab(), namespace)
return vocab

@classmethod
def empty(cls) -> "Vocabulary":
"""
@@ -465,6 +473,14 @@ def extend_from_vocab(self, vocab: "Vocabulary") -> None:
for token in vocab.get_token_to_index_vocabulary(namespace):
self.add_token_to_namespace(token, namespace)

def extend_from_dictionary(self, encoding_dictionary: Dict[str, int], namespace: str = "from_transformers") -> None:
"""
Populates the given namespace with a precomputed encoding, for example from a pretrained transformer.
"""
for word, idx in encoding_dictionary.items():
self._token_to_index[namespace][word] = idx
self._index_to_token[namespace][idx] = word

def _extend(
self,
counter: Dict[str, Dict[str, int]] = None,
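A hypothetical usage sketch of the two methods added above (the model name and the label mapping are placeholders): from_transformers mirrors a pretrained tokenizer's ids into a namespace, and extend_from_dictionary does the same for any precomputed token-to-index mapping.

from allennlp.data import Vocabulary

# Mirror a pretrained tokenizer's wordpiece ids into the "tokens" namespace.
vocab = Vocabulary.from_transformers("bert-base-uncased", namespace="tokens")
print(vocab.get_token_index("[CLS]", "tokens"))  # same id the tokenizer assigns to [CLS]

# The same works for any precomputed token -> index mapping, e.g. a fixed tag set.
vocab.extend_from_dictionary({"O": 0, "B-PER": 1, "I-PER": 2}, namespace="labels")
print(vocab.get_token_index("B-PER", "labels"))  # -> 1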