Merge pull request #18 from RasaHQ/bugfixes
Bugfixes
koaning authored Jul 23, 2020
2 parents 24620ad + 210b5b8 commit 75e011e
Showing 7 changed files with 130 additions and 18 deletions.
4 changes: 3 additions & 1 deletion docs/docs/featurizer/bytepair.md
@@ -91,4 +91,6 @@ Note that in this case we expect two files to be present in the `tests/data` dir
 - `en.wiki.bpe.vs10000.model`

 You can also overwrite the names of these files via the `model_file` and `emb_file` settings. But it
-is preferable to stick to the library naming convention.
+is preferable to stick to the library naming convention. Also note that if you use the `model_file` and
+`emb_file` settings, you must provide full file paths and the `cache_dir` will be ignored. It is
+still considered good practice to manually specify the `lang`, `dim` and `vs` parameters in this situation.
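
For reference, the manual configuration described above looks as follows in code; this sketch reuses the file paths from this commit's test suite, which must exist locally:

from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer

# Manual setup: `model_file` and `emb_file` take full file paths and
# `cache_dir` is ignored; `lang`, `dim` and `vs` are still given explicitly.
featurizer = BytePairFeaturizer(
    component_config=dict(
        lang="en",
        vs=1000,
        dim=25,
        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
        emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
    )
)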
2 changes: 0 additions & 2 deletions rasa_nlu_examples/errors.py

This file was deleted.

41 changes: 36 additions & 5 deletions rasa_nlu_examples/featurizers/dense/bpemb_featurizer.py
@@ -1,3 +1,4 @@
+import os
 import typing
 from pathlib import Path
 from typing import Any, Optional, Text, Dict, List, Type
@@ -35,20 +36,19 @@ def required_packages(cls) -> List[Text]:

     defaults = {
         # specifies the language of the subword segmentation model
-        "lang": "en",
+        "lang": None,
         # specifies the dimension of the subword embeddings
-        "dim": 25,
+        "dim": None,
         # specifies the vocabulary size of the segmentation model
-        "vs": 1000,
+        "vs": None,
         # if set to True and the given vocabulary size can't be loaded for the given
         # model, the closest size is chosen
         "vs_fallback": True,
         # specifies the folder in which downloaded BPEmb files will be cached
         "cache_dir": str(Path.home() / Path(".cache/bpemb")),
         # specifies the path to a custom SentencePiece model file
         "model_file": None,
-        # specifies the path to a custom embedding file. Supported formats are Word2Vec
-        # plain text and GenSim binary.
+        # specifies the path to a custom embedding file
         "emb_file": None,
     }
@@ -343,12 +343,43 @@ def required_packages(cls) -> List[Text]:
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)

+        model_file, emb_file = (
+            self.component_config[k] for k in ["model_file", "emb_file"]
+        )
+        if model_file:
+            if not os.path.exists(model_file):
+                raise FileNotFoundError(
+                    f"BytePair model {model_file} not found. Please check config."
+                )
+        if emb_file:
+            if not os.path.exists(emb_file):
+                raise FileNotFoundError(
+                    f"BytePair embedding file {emb_file} not found. Please check config."
+                )
+
+        if not self.component_config["lang"]:
+            raise ValueError(
+                "You must specify the `lang` parameter for BytePairEmbedding in `config.yml`."
+            )
+
+        if not self.component_config["vs"]:
+            raise ValueError(
+                "You must specify the `vs` parameter for BytePairEmbedding in `config.yml`."
+            )
+
+        if not self.component_config["dim"]:
+            raise ValueError(
+                "You must specify the `dim` parameter for BytePairEmbedding in `config.yml`."
+            )
+
         self.model = BPEmb(
             lang=self.component_config["lang"],
             dim=self.component_config["dim"],
             vs=self.component_config["vs"],
             vs_fallback=self.component_config["vs_fallback"],
             cache_dir=self.component_config["cache_dir"],
+            model_file=self.component_config["model_file"],
+            emb_file=self.component_config["emb_file"],
         )

     def train(
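
These checks make a misconfigured pipeline fail at construction time rather than deep inside training. A minimal illustrative sketch of the new behavior (not part of the commit):

from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer

# `lang`, `vs` and `dim` now default to None, so omitting one raises
# a ValueError instead of silently falling back to an English model.
try:
    BytePairFeaturizer(component_config=dict(lang="en", vs=1000))  # no `dim`
except ValueError as err:
    print(err)  # You must specify the `dim` parameter for BytePairEmbedding in `config.yml`.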
11 changes: 10 additions & 1 deletion rasa_nlu_examples/featurizers/dense/fasttext_featurizer.py
@@ -17,7 +17,6 @@
     TOKENS_NAMES,
 )

-
 if typing.TYPE_CHECKING:
     from rasa.nlu.model import Metadata

@@ -39,6 +38,16 @@ def required_packages(cls) -> List[Text]:
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)
         path = os.path.join(component_config["cache_dir"], component_config["file"])
+
+        if not os.path.exists(component_config["cache_dir"]):
+            raise FileNotFoundError(
+                f"It seems that the cache dir {component_config['cache_dir']} does not exist. Please check config."
+            )
+        if not os.path.exists(path):
+            raise FileNotFoundError(
+                f"It seems that file {path} does not exist. Please check config."
+            )
+
         self.model = fasttext.load_model(path)

     def train(
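
The FastText featurizer gains the same fail-fast checks. A minimal sketch, assuming FastTextFeaturizer is importable from rasa_nlu_examples.featurizers.dense like its BytePair counterpart and that a pretrained model has already been downloaded; both paths below are placeholders:

from rasa_nlu_examples.featurizers.dense import FastTextFeaturizer

# The model is loaded from `<cache_dir>/<file>`; a missing directory or
# file now raises FileNotFoundError in __init__ instead of failing
# inside fasttext.load_model with a less helpful message.
featurizer = FastTextFeaturizer(
    component_config={"cache_dir": "/path/to/fasttext", "file": "cc.en.300.bin"}
)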
4 changes: 2 additions & 2 deletions setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages

-base_packages = ["rasa>=1.10.0", "fasttext==0.9.2", "bpemb==0.3.0"]
+base_packages = ["rasa>=1.10.0", "fasttext==0.9.2", "bpemb==0.3.2"]

 dev_packages = [
     "flake8>=3.6.0",
@@ -17,7 +17,7 @@

 setup(
     name="rasa_nlu_examples",
-    version="0.1.0",
+    version="0.1.1",
     packages=find_packages(exclude=["notebooks"]),
     install_requires=base_packages,
     extras_require={"dev": dev_packages},
71 changes: 64 additions & 7 deletions tests/test_featurizers/test_bpemb_featurizer.py
@@ -4,19 +4,76 @@
 from rasa_nlu_examples.featurizers.dense import BytePairFeaturizer
 from .featurizer_checks import dense_standard_test_combinations

-
-config = dict(lang="en", vs=1000, dim=25, vs_fallback=True)
+config_man = dict(
+    lang="en",
+    vs=1000,
+    dim=25,
+    model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+    emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
+)
+config_auto = dict(lang="en", vs=1000, dim=25, vs_fallback=True)
 tokenizer = WhitespaceTokenizer()
-featurizer = BytePairFeaturizer(component_config=config)


-def test_model_loaded():
-    assert featurizer
+@pytest.mark.parametrize(
+    "test_fn,tok,feat,msg",
+    dense_standard_test_combinations(
+        tokenizer=tokenizer, featurizer=BytePairFeaturizer(component_config=config_auto)
+    ),
+)
+def test_auto_featurizer_checks(test_fn, tok, feat, msg):
+    test_fn(tok, feat, msg)


 @pytest.mark.parametrize(
     "test_fn,tok,feat,msg",
-    dense_standard_test_combinations(tokenizer=tokenizer, featurizer=featurizer),
+    dense_standard_test_combinations(
+        tokenizer=tokenizer, featurizer=BytePairFeaturizer(component_config=config_man)
+    ),
 )
-def test_featurizer_checks(test_fn, tok, feat, msg):
+def test_man_featurizer_checks(test_fn, tok, feat, msg):
     test_fn(tok, feat, msg)
+
+
+def test_raise_error_missing_model_file():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.dinosaur.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.bpe.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+def test_raise_error_missing_emb_file():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.dinosaur.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+def test_config_missing():
+    config_bad = dict(
+        lang="en",
+        vs=1000,
+        dim=25,
+        model_file="tests/data/en/en.wiki.bpe.vs1000.model",
+        emb_file="tests/data/en/en.wiki.dinosaur.vs1000.d25.w2v.bin",
+    )
+    with pytest.raises(FileNotFoundError):
+        BytePairFeaturizer(component_config=config_bad)
+
+
+@pytest.mark.parametrize(
+    "conf", [dict(lang="en", vs=1000), dict(lang="en", dim=25), dict(dim=25, vs=1000)]
+)
+def test_raise_missing_error(conf):
+    with pytest.raises(ValueError):
+        BytePairFeaturizer(component_config=conf)
15 changes: 15 additions & 0 deletions tests/test_featurizers/test_fasttext_featurizer.py
@@ -26,3 +26,18 @@ def test_model_loaded():
 )
 def test_featurizer_checks(test_fn, tok, feat, msg):
     test_fn(tok, feat, msg)
+
+
+def test_raise_cachedir_error():
+    bad_folder = str(test_folder / "foobar")
+    with pytest.raises(FileNotFoundError):
+        FastTextFeaturizer(
+            component_config={"cache_dir": bad_folder, "file": file_name}
+        )
+
+
+def test_raise_file_error():
+    with pytest.raises(FileNotFoundError):
+        FastTextFeaturizer(
+            component_config={"cache_dir": test_folder, "file": "dinosaur.bin"}
+        )
