Commit: showing 12 changed files with 344 additions and 31 deletions.
@@ -0,0 +1,83 @@
class ClassVocab:
    def __init__(self):
        self.vocab = {}

    def __getitem__(self, k):
        # NaN metadata (pandas missing values) maps to a dedicated class
        if isinstance(k, float):
            k = 'empty'
        # use the first non-empty segment of the category path as the class
        k = [x for x in k.split('/') if x][0]
        if k not in self.vocab:
            x = len(self.vocab)
            self.vocab[k] = x
            return x
        return self.vocab[k]


def train_model(corpus):
    # conventions: X are features, y are labels
    # X_train is the array of training feature values,
    # X_test is the array of test feature values,
    # y_train are labels for X_train, y_test are labels for X_test

    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from itertools import tee

    from eea.corpus.utils import tokenizer

    docs = (doc for doc in corpus
            if not isinstance(doc.metadata['Category Path'], float))
    docs_stream, meta_stream = tee(docs, 2)

    print("Transforming docs")
    docs = [doc.text for doc in docs_stream]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(input='content', strip_accents='unicode',
                           tokenizer=tokenizer,  # stop_words='english',
                           max_features=5000)

    X = vect.fit_transform(docs)

    from sklearn.feature_extraction.text import TfidfTransformer
    transf = TfidfTransformer()
    X = transf.fit_transform(X)
    # X = X.toarray()  # only needed for GradientBoostingClassifier

    # from sklearn.feature_extraction.text import TfidfVectorizer
    # vect = TfidfVectorizer(max_features=5000,
    #                        ngram_range=(1, 3), sublinear_tf=True)
    # X = vect.fit_transform(docs)

    # from sklearn.ensemble import RandomForestClassifier
    # model = RandomForestClassifier(n_estimators=100)  # acc: 0.73

    # from sklearn import svm
    # model = svm.SVC(kernel='poly', degree=3, C=1.0)  # acc: 0.66

    # from sklearn.naive_bayes import MultinomialNB  # acc: 0.73
    # model = MultinomialNB(alpha=0.1)  # , fit_prior=True

    # slow to train; accuracy may improve with more estimators
    # and a higher learning_rate
    # from sklearn.ensemble import GradientBoostingClassifier  # acc: 0.65
    # model = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1)

    # acc: 0.763 with tf-idf over CountVectorizer(5000), 0.7 without tf-idf
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()

    vocab = ClassVocab()
    y = [vocab[doc.metadata['Category Path']] for doc in meta_stream]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=3311)

    print("Training on %s docs" % X_train.shape[0])

    print("Fitting model")
    model.fit(X_train, y_train)
    print("done")

    pred = model.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    print(score)
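A minimal usage sketch of the two pieces above (my_corpus is a hypothetical stand-in for a textacy-style corpus whose docs expose .text and .metadata; it is not defined in this commit):

vocab = ClassVocab()
vocab['/Energy/Policy']   # -> 0: first non-empty path segment, 'Energy', gets id 0
vocab['/Energy/Reports']  # -> 0 again: only the first segment is used as the class
vocab[float('nan')]       # -> 1: missing (NaN) paths fall back to the 'empty' class

train_model(my_corpus)    # vectorizes, fits LogisticRegression, prints held-out accuracy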
Empty file.
@@ -0,0 +1,105 @@
from eea.corpus.schema import ClassifficationModelSchema
from eea.corpus.utils import tokenizer
from eea.corpus.corpus import get_corpus
from pyramid.view import view_config
from pyramid_deform import FormView
import pyramid.httpexceptions as exc


@view_config(route_name="corpus_classify",
             renderer='eea.corpus:templates/classify.pt')
class CreateClassificationModelView(FormView):
    schema = ClassifficationModelSchema()
    buttons = ('classify', 'fasttext')

    score = None

    def corpus(self):
        """ Return a corpus based on environment.

        It will try to return it from cache, otherwise load it from disk.
        If the corpus hasn't been extracted from the document, redirect to
        a corpus creation tool.
        """

        corpus = get_corpus(self.request)
        if corpus is None:
            raise exc.HTTPNotFound()
        return corpus

    def metadata(self):
        """ Show metadata about the context document
        """
        # TODO: show info about processing and column
        corpus = self.corpus()
        return {
            'docs': corpus.n_docs,
            'sentences': corpus.n_sents,
            'tokens': corpus.n_tokens,
            'lang': corpus.lang,
        }

    def classify_success(self, appstruct):
        # TODO: not implemented yet
        corpus = self.corpus()

    def fasttext_success(self, appstruct):
        from itertools import islice
        # from pyfasttext import FastText

        corpus = self.corpus()
        docs = [doc for doc in corpus
                if not isinstance(doc.metadata['Category Path'], float)]

        # 90/10 train/test split
        split = int(corpus.n_docs * 0.9)  # TODO: should be len(docs)

        train_docs = islice(docs, 0, split)
        test_docs = islice(docs, split, corpus.n_docs)

        print('Writing corpus to disk')
        lines = []
        for doc in train_docs:
            # '/a/b' becomes ' __label__a __label__b' (paths start with '/')
            labels = doc.metadata['Category Path'].replace('/', ' __label__')
            labels = labels.strip()
            # labels = '__label__' + doc.metadata['Category Path'].split('/')[1]
            text = doc.text.replace('\n', ' ')
            line = " ".join([labels, text])
            lines.append(line)

        import unicodedata
        with open('/tmp/corpus-train.txt', 'wb') as f:
            s = '\n'.join(lines)
            s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
            f.write(s)

        y_test = []
        test_lines = []
        with open('/tmp/corpus-test.txt', 'w') as f:
            for doc in test_docs:
                labels = [x for x in doc.metadata['Category Path'].split('/')
                          if x]
                # labels = '__label__' + \
                #     doc.metadata['Category Path'].split('/')[1]
                test_lines.append(doc.text.replace('\n', ' '))
                y_test.append(labels)
            f.write('\n'.join(test_lines))

        print("Training model")
        import fasttext as ft
        model = ft.supervised(input_file='/tmp/corpus-train.txt',
                              output='/tmp/ftmodel', epoch=100)
        print("Model trained")

        # from sklearn import metrics
        # self.score = metrics.accuracy_score(y_test, pred)

        pred = model.predict(test_lines, k=2)
        zz = list(zip(pred, y_test))
        tt = [x for x in zz if set(x[0]) != set(x[1])]
        notok = len(tt)
        # percentage of mispredicted documents: an error rate, not accuracy
        self.score = notok * 100 / len(zz)
        print("Score %s" % self.score)

        # xx = model.predict_proba(test_lines, k=2)
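For reference, the training file written above follows fastText's supervised format: one document per line, each category-path segment prefixed with __label__, followed by the document text with newlines flattened. An illustrative line (invented content, not taken from the actual corpus):

__label__Energy __label__Policy Full text of the document on a single line ...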
@@ -0,0 +1,28 @@
from colander import Schema
from eea.corpus.processing import pipeline_component
from eea.corpus.utils import set_text
from eea.corpus.utils import tokenizer
import logging

logger = logging.getLogger('eea.corpus')


class Tokenizer(Schema):
    """ Schema for the Tokenizer processing.
    """

    description = "Simple, dumb tokenizer. Strips non-alpha and small words"


@pipeline_component(schema=Tokenizer,
                    title="Simple text tokenization")
def process(content, env, **settings):
    """ Tokenization
    """

    for doc in content:
        text = " ".join(tokenizer(doc.text))

        try:
            yield set_text(doc, text)
        except Exception:
            logger.exception("Error in converting to Doc %r", text)
            continue
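The tokenizer function itself lives in eea.corpus.utils and is not part of this diff. Given the schema description ("strips non-alpha and small words"), a plausible equivalent might look like this (a sketch under that assumption, not the actual implementation):

import re

def tokenizer(text, min_len=3):
    # hypothetical stand-in for eea.corpus.utils.tokenizer:
    # keep only alphabetic tokens of at least min_len characters
    return [t for t in re.findall(r'[A-Za-z]+', text) if len(t) >= min_len]

tokenizer("The EU's 2020 energy strategy!")
# -> ['The', 'energy', 'strategy']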
@@ -0,0 +1,37 @@
<div metal:use-macro="load: layout.pt">

  <div metal:fill-slot="sidebar-top"></div>
  <div metal:fill-slot="sidebar"></div>

  <div metal:fill-slot="full-width">
    <div class="row">
      <div class="col-md-12">
        <h1>Create a new classification model</h1>
        <p>Set required parameters for the classification model</p>
      </div>
    </div>

    <div class="row">

      <div class="col-md-5" id="corpus-form">
        <form tal:replace="structure form"></form>
      </div>

      <div class="col-md-7">
        <div class="content">
          <h4>Preview result</h4>
          <p>Choose row and processing settings to preview results</p>

          <div class="panel panel-default">
            <div class="panel-body">
              Score: ${view.score}
            </div>
          </div>

        </div>
      </div>

    </div>
  </div>
</div>