Commit: showing 12 changed files with 344 additions and 31 deletions.
@@ -0,0 +1,83 @@
class ClassVocab:
    def __init__(self):
        self.vocab = {}

    def __getitem__(self, k):
        # NaN metadata (pandas missing values) maps to a dedicated class
        if isinstance(k, float):
            k = 'empty'
        # use the first non-empty segment of the category path as the class
        k = [x for x in k.split('/') if x][0]
        if k not in self.vocab:
            x = len(self.vocab)
            self.vocab[k] = x
            return x
        return self.vocab[k]


def train_model(corpus):
    # conventions: X are features, y are labels
    # X_train is the array of training feature values,
    # X_test is the array of test feature values,
    # y_train are labels for X_train, y_test are labels for X_test

    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from itertools import tee

    from eea.corpus.utils import tokenizer

    docs = (doc for doc in corpus
            if not isinstance(doc.metadata['Category Path'], float))
    docs_stream, meta_stream = tee(docs, 2)

    print("Transforming docs")
    docs = [doc.text for doc in docs_stream]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(input='content', strip_accents='unicode',
                           tokenizer=tokenizer,  # stop_words='english',
                           max_features=5000)

    X = vect.fit_transform(docs)

    from sklearn.feature_extraction.text import TfidfTransformer
    transf = TfidfTransformer()
    X = transf.fit_transform(X)
    # X = X.toarray()  # only needed for GradientBoostingClassifier

    # from sklearn.feature_extraction.text import TfidfVectorizer
    # vect = TfidfVectorizer(max_features=5000,
    #                        ngram_range=(1, 3), sublinear_tf=True)
    # X = vect.fit_transform(docs)

    # from sklearn.ensemble import RandomForestClassifier
    # model = RandomForestClassifier(n_estimators=100)  # acc: 0.73

    # from sklearn import svm
    # model = svm.SVC(kernel='poly', degree=3, C=1.0)  # acc: 0.66

    # from sklearn.naive_bayes import MultinomialNB  # acc: 0.73
    # model = MultinomialNB(alpha=0.1)  # , fit_prior=True

    # slow to train; accuracy may improve with more estimators
    # and a higher learning_rate
    # from sklearn.ensemble import GradientBoostingClassifier  # acc: 0.65
    # model = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1)

    # acc: 0.763 with tf-idf over CountVectorizer(5000), 0.7 without tf-idf
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()

    vocab = ClassVocab()
    y = [vocab[doc.metadata['Category Path']] for doc in meta_stream]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=3311)

    print("Training on %s docs" % X_train.shape[0])

    print("Fitting model")
    model.fit(X_train, y_train)
    print("done")

    pred = model.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    print(score)
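A minimal usage sketch of the two pieces above (my_corpus is a hypothetical stand-in for a textacy-style corpus whose docs expose .text and .metadata; it is not defined in this commit):

vocab = ClassVocab()
vocab['/Energy/Policy']   # -> 0: first non-empty path segment, 'Energy', gets id 0
vocab['/Energy/Reports']  # -> 0 again: only the first segment is used as the class
vocab[float('nan')]       # -> 1: missing (NaN) paths fall back to the 'empty' class

train_model(my_corpus)    # vectorizes, fits LogisticRegression, prints held-out accuracy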
Empty file.
@@ -0,0 +1,105 @@
from eea.corpus.schema import ClassifficationModelSchema
from eea.corpus.utils import tokenizer
from eea.corpus.corpus import get_corpus
from pyramid.view import view_config
from pyramid_deform import FormView
import pyramid.httpexceptions as exc


@view_config(route_name="corpus_classify",
             renderer='eea.corpus:templates/classify.pt')
class CreateClassificationModelView(FormView):
    schema = ClassifficationModelSchema()
    buttons = ('classify', 'fasttext')

    score = None

    def corpus(self):
        """ Return a corpus based on environment.

        It will try to return it from cache, otherwise load it from disk.
        If the corpus hasn't been extracted from the document, redirect to
        a corpus creation tool.
        """

        corpus = get_corpus(self.request)
        if corpus is None:
            raise exc.HTTPNotFound()
        return corpus

    def metadata(self):
        """ Show metadata about the context document
        """
        # TODO: show info about processing and column
        corpus = self.corpus()
        return {
            'docs': corpus.n_docs,
            'sentences': corpus.n_sents,
            'tokens': corpus.n_tokens,
            'lang': corpus.lang,
        }

    def classify_success(self, appstruct):
        # TODO: not implemented yet
        corpus = self.corpus()

    def fasttext_success(self, appstruct):
        from itertools import islice
        # from pyfasttext import FastText

        corpus = self.corpus()
        docs = [doc for doc in corpus
                if not isinstance(doc.metadata['Category Path'], float)]

        # 90/10 train/test split
        split = int(corpus.n_docs * 0.9)  # TODO: should be len(docs)

        train_docs = islice(docs, 0, split)
        test_docs = islice(docs, split, corpus.n_docs)

        print('Writing corpus to disk')
        lines = []
        for doc in train_docs:
            # '/a/b' becomes ' __label__a __label__b' (paths start with '/')
            labels = doc.metadata['Category Path'].replace('/', ' __label__')
            labels = labels.strip()
            # labels = '__label__' + doc.metadata['Category Path'].split('/')[1]
            text = doc.text.replace('\n', ' ')
            line = " ".join([labels, text])
            lines.append(line)

        import unicodedata
        with open('/tmp/corpus-train.txt', 'wb') as f:
            s = '\n'.join(lines)
            s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
            f.write(s)

        y_test = []
        test_lines = []
        with open('/tmp/corpus-test.txt', 'w') as f:
            for doc in test_docs:
                labels = [x for x in doc.metadata['Category Path'].split('/')
                          if x]
                # labels = '__label__' + \
                #     doc.metadata['Category Path'].split('/')[1]
                test_lines.append(doc.text.replace('\n', ' '))
                y_test.append(labels)
            f.write('\n'.join(test_lines))

        print("Training model")
        import fasttext as ft
        model = ft.supervised(input_file='/tmp/corpus-train.txt',
                              output='/tmp/ftmodel', epoch=100)
        print("Model trained")

        # from sklearn import metrics
        # self.score = metrics.accuracy_score(y_test, pred)

        pred = model.predict(test_lines, k=2)
        zz = list(zip(pred, y_test))
        tt = [x for x in zz if set(x[0]) != set(x[1])]
        notok = len(tt)
        # percentage of mispredicted documents: an error rate, not accuracy
        self.score = notok * 100 / len(zz)
        print("Score %s" % self.score)

        # xx = model.predict_proba(test_lines, k=2)
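For reference, the training file written above follows fastText's supervised format: one document per line, each category-path segment prefixed with __label__, followed by the document text with newlines flattened. An illustrative line (invented content, not taken from the actual corpus):

__label__Energy __label__Policy Full text of the document on a single line ...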
@@ -0,0 +1,28 @@
from colander import Schema
from eea.corpus.processing import pipeline_component
from eea.corpus.utils import set_text
from eea.corpus.utils import tokenizer
import logging

logger = logging.getLogger('eea.corpus')


class Tokenizer(Schema):
    """ Schema for the Tokenizer processing.
    """

    description = "Simple, dumb tokenizer. Strips non-alpha and small words"


@pipeline_component(schema=Tokenizer,
                    title="Simple text tokenization")
def process(content, env, **settings):
    """ Tokenization
    """

    for doc in content:
        text = " ".join(tokenizer(doc.text))

        try:
            yield set_text(doc, text)
        except Exception:
            logger.exception("Error in converting to Doc %r", text)
            continue
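The tokenizer function itself lives in eea.corpus.utils and is not part of this diff. Given the schema description ("strips non-alpha and small words"), a plausible equivalent might look like this (a sketch under that assumption, not the actual implementation):

import re

def tokenizer(text, min_len=3):
    # hypothetical stand-in for eea.corpus.utils.tokenizer:
    # keep only alphabetic tokens of at least min_len characters
    return [t for t in re.findall(r'[A-Za-z]+', text) if len(t) >= min_len]

tokenizer("The EU's 2020 energy strategy!")
# -> ['The', 'energy', 'strategy']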
@@ -0,0 +1,37 @@
<div metal:use-macro="load: layout.pt">

  <div metal:fill-slot="sidebar-top"></div>
  <div metal:fill-slot="sidebar"></div>

  <div metal:fill-slot="full-width">
    <div class="row">
      <div class="col-md-12">
        <h1>Create a new classification model</h1>
        <p>Set required parameters for the classification model</p>
      </div>
    </div>

    <div class="row">

      <div class="col-md-5" id="corpus-form">
        <form tal:replace="structure form"></form>
      </div>

      <div class="col-md-7">
        <div class="content">
          <h4>Preview result</h4>
          <p>Choose row and processing settings to preview results</p>

          <div class="panel panel-default">
            <div class="panel-body">
              Score: ${view.score}
            </div>
          </div>

        </div>
      </div>

    </div>
  </div>
</div>