Merge pull request #18 from RasaHQ/bugfixes
Bugfixes
koaning authored Jul 23, 2020
2 parents 24620ad + 210b5b8 commit 75e011e
Showing 7 changed files with 130 additions and 18 deletions.
4 changes: 3 additions & 1 deletion docs/docs/featurizer/bytepair.md
@@ -91,4 +91,6 @@ Note that in this case we expect two files to be present in the `tests/data` dir
 - `en.wiki.bpe.vs10000.model`

 You can also overwrite the names of these files via the `model_file` and `emb_file` settings. But it
-is preferable to stick to the library naming convention.
+is preferable to stick to the library naming convention. Also note that if you use the `model_file` and
+`emb_file` settings, you must provide full file paths and the `cache_dir` will be ignored. It is
+still considered good practice to manually specify the `lang`, `dim` and `vs` parameters in this situation.
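
For reference, the manual configuration described above looks as follows in code; this sketch reuses the file paths from this commit's test suite, which must exist locally:

from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer

# Manual setup: `model_file` and `emb_file` take full file paths and
# `cache_dir` is ignored; `lang`, `dim` and `vs` are still given explicitly.
featurizer = BytePairFeaturizer(
    component_config=dict(
        lang="en",
        vs=1000,
        dim=25,
        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
        emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
    )
)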
2 changes: 0 additions & 2 deletions rasa_nlu_examples/errors.py

This file was deleted.

41 changes: 36 additions & 5 deletions rasa_nlu_examples/featurizers/dense/bpemb_featurizer.py
@@ -1,3 +1,4 @@
+import os
 import typing
 from pathlib import Path
 from typing import Any, Optional, Text, Dict, List, Type
@@ -35,20 +36,19 @@ def required_packages(cls) -> List[Text]:

     defaults = {
         # specifies the language of the subword segmentation model
-        "lang": "en",
+        "lang": None,
         # specifies the dimension of the subword embeddings
-        "dim": 25,
+        "dim": None,
         # specifies the vocabulary size of the segmentation model
-        "vs": 1000,
+        "vs": None,
         # if set to True and the given vocabulary size can't be loaded for the given
         # model, the closest size is chosen
         "vs_fallback": True,
         # specifies the folder in which downloaded BPEmb files will be cached
         "cache_dir": str(Path.home() / Path(".cache/bpemb")),
         # specifies the path to a custom SentencePiece model file
         "model_file": None,
-        # specifies the path to a custom embedding file. Supported formats are Word2Vec
-        # plain text and GenSim binary.
+        # specifies the path to a custom embedding file
         "emb_file": None,
     }
@@ -343,12 +343,43 @@ def required_packages(cls) -> List[Text]:
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)

+        model_file, emb_file = (
+            self.component_config[k] for k in ["model_file", "emb_file"]
+        )
+        if model_file:
+            if not os.path.exists(model_file):
+                raise FileNotFoundError(
+                    f"BytePair model {model_file} not found. Please check config."
+                )
+        if emb_file:
+            if not os.path.exists(emb_file):
+                raise FileNotFoundError(
+                    f"BytePair embedding file {emb_file} not found. Please check config."
+                )
+
+        if not self.component_config["lang"]:
+            raise ValueError(
+                "You must specify the `lang` parameter for BytePairEmbedding in `config.yml`."
+            )
+
+        if not self.component_config["vs"]:
+            raise ValueError(
+                "You must specify the `vs` parameter for BytePairEmbedding in `config.yml`."
+            )
+
+        if not self.component_config["dim"]:
+            raise ValueError(
+                "You must specify the `dim` parameter for BytePairEmbedding in `config.yml`."
+            )
+
         self.model = BPEmb(
             lang=self.component_config["lang"],
             dim=self.component_config["dim"],
             vs=self.component_config["vs"],
             vs_fallback=self.component_config["vs_fallback"],
             cache_dir=self.component_config["cache_dir"],
+            model_file=self.component_config["model_file"],
+            emb_file=self.component_config["emb_file"],
         )

     def train(
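
These checks make a misconfigured pipeline fail at construction time rather than deep inside training. A minimal illustrative sketch of the new behavior (not part of the commit):

from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer

# `lang`, `vs` and `dim` now default to None, so omitting one raises
# a ValueError instead of silently falling back to an English model.
try:
    BytePairFeaturizer(component_config=dict(lang="en", vs=1000))  # no `dim`
except ValueError as err:
    print(err)  # You must specify the `dim` parameter for BytePairEmbedding in `config.yml`.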
11 changes: 10 additions & 1 deletion rasa_nlu_examples/featurizers/dense/fasttext_featurizer.py
@@ -17,7 +17,6 @@
     TOKENS_NAMES,
 )

-
 if typing.TYPE_CHECKING:
     from rasa.nlu.model import Metadata

@@ -39,6 +38,16 @@ def required_packages(cls) -> List[Text]:
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)
         path = os.path.join(component_config["cache_dir"], component_config["file"])
+
+        if not os.path.exists(component_config["cache_dir"]):
+            raise FileNotFoundError(
+                f"It seems that the cache dir {component_config['cache_dir']} does not exist. Please check config."
+            )
+        if not os.path.exists(path):
+            raise FileNotFoundError(
+                f"It seems that file {path} does not exist. Please check config."
+            )
+
         self.model = fasttext.load_model(path)

     def train(
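
The FastText featurizer gains the same fail-fast checks. A minimal sketch, assuming FastTextFeaturizer is importable from rasa_nlu_examples.featurizers.dense like its BytePair counterpart and that a pretrained model has already been downloaded; both paths below are placeholders:

from rasa_nlu_examples.featurizers.dense import FastTextFeaturizer

# The model is loaded from `<cache_dir>/<file>`; a missing directory or
# file now raises FileNotFoundError in __init__ instead of failing
# inside fasttext.load_model with a less helpful message.
featurizer = FastTextFeaturizer(
    component_config={"cache_dir": "/path/to/fasttext", "file": "cc.en.300.bin"}
)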
4 changes: 2 additions & 2 deletions setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages

-base_packages = ["rasa>=1.10.0", "fasttext==0.9.2", "bpemb==0.3.0"]
+base_packages = ["rasa>=1.10.0", "fasttext==0.9.2", "bpemb==0.3.2"]

 dev_packages = [
     "flake8>=3.6.0",
@@ -17,7 +17,7 @@

 setup(
     name="rasa_nlu_examples",
-    version="0.1.0",
+    version="0.1.1",
     packages=find_packages(exclude=["notebooks"]),
     install_requires=base_packages,
     extras_require={"dev": dev_packages},
71 changes: 64 additions & 7 deletions tests/test_featurizers/test_bpemb_featurizer.py
@@ -4,19 +4,76 @@
 from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer
 from .featurizer_checks import dense_standard_test_combinations

-
-config = dict(lang="en", vs=1000, dim=25, vs_fallback=True)
+config_man = dict(
+    lang="en",
+    vs=1000,
+    dim=25,
+    model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+    emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
+)
+config_auto = dict(lang="en", vs=1000, dim=25, vs_fallback=True)
 tokenizer = WhitespaceTokenizer()
-featurizer = BytePairFeaturizer(component_config=config)


-def test_model_loaded():
-    assert featurizer
+@pytest.mark.parametrize(
+    "test_fn,tok,feat,msg",
+    dense_standard_test_combinations(
+        tokenizer=tokenizer, featurizer=BytePairFeaturizer(component_config=config_auto)
+    ),
+)
+def test_auto_featurizer_checks(test_fn, tok, feat, msg):
+    test_fn(tok, feat, msg)


 @pytest.mark.parametrize(
     "test_fn,tok,feat,msg",
-    dense_standard_test_combinations(tokenizer=tokenizer, featurizer=featurizer),
+    dense_standard_test_combinations(
+        tokenizer=tokenizer, featurizer=BytePairFeaturizer(component_config=config_man)
+    ),
 )
-def test_featurizer_checks(test_fn, tok, feat, msg):
+def test_man_featurizer_checks(test_fn, tok, feat, msg):
     test_fn(tok, feat, msg)
+
+
+def test_raise_error_missing_model_file():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.dinosaur.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+def test_raise_error_missing_emb_file():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.dinosaur.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+def test_config_missing():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.dinosaur.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+@pytest.mark.parametrize(
+    "conf", [dict(lang="en", vs=1000), dict(lang="en", dim=25), dict(dim=25, vs=1000)]
+)
+def test_raise_missing_error(conf):
+    with pytest.raises(ValueError):
+        BytePairFeaturizer(component_config=conf)
15 changes: 15 additions & 0 deletions tests/test_featurizers/test_fasttext_featurizer.py
@@ -26,3 +26,18 @@ def test_model_loaded():
 )
 def test_featurizer_checks(test_fn, tok, feat, msg):
     test_fn(tok, feat, msg)
+
+
+def test_raise_cachedir_error():
+    bad_folder = str(test_folder / "foobar")
+    with pytest.raises(FileNotFoundError):
+        FastTextFeaturizer(
+            component_config={"cache_dir": bad_folder, "file": file_name}
+        )
+
+
+def test_raise_file_error():
+    with pytest.raises(FileNotFoundError):
+        FastTextFeaturizer(
+            component_config={"cache_dir": test_folder, "file": "dinosaur.bin"}
+        )
