-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
90 lines (78 loc) · 2.51 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
class TagCleaner(BaseEstimator, TransformerMixin):
    """Normalise rows of free-text tags into lists of cleaned terms.

    Each input row is an iterable of strings. Every string is lower-cased
    and stripped of surrounding whitespace, then handled by the first rule
    that matches:

    * contains 'foundation'  -> the term is dropped;
    * contains ' and '       -> split on ' and ', every piece is kept;
    * contains ' but '       -> split on ' but ', every piece is kept;
    * contains 'loves '      -> every 'loves ' occurrence is removed;
    * contains 'super '      -> every 'super ' occurrence is removed;
    * otherwise              -> the term is kept as is.

    Pieces produced by a split are not passed through the rules again.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _expand(tag):
        """Return the list of cleaned pieces for one lower-cased, stripped tag."""
        if 'foundation' in tag:
            return []
        # Conjunctions take priority over filler-word removal, ' and ' first.
        for joiner in (' and ', ' but '):
            if joiner in tag:
                return tag.split(joiner)
        for filler in ('loves ', 'super '):
            if filler in tag:
                return [re.sub(filler, '', tag)]
        return [tag]

    def transform(self, X):
        """Clean every row of X and return a pandas Series of term lists."""
        cleaned_rows = []
        for tags in X:
            pieces = []
            for raw in tags:
                pieces.extend(self._expand(raw.lower().strip()))
            cleaned_rows.append(pieces)
        return pd.Series(cleaned_rows)
class DictEncoder(BaseEstimator, TransformerMixin):
    """Turn each row of terms into a dictionary of presence flags.

    Every term in a row is lower-cased and used as a key whose value is 1
    (a repeated term still maps to 1; occurrences are not counted). A row
    of length zero becomes ``{'empty': 1}``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Encode every row of X and return a pandas Series of dictionaries."""
        encoded = [
            {'empty': 1} if len(terms) == 0
            else {term.lower(): 1 for term in terms}
            for terms in X
        ]
        return pd.Series(encoded)
# turn age into age_code
def CodeAge(age):
    """Map an age label to its integer code.

    "Puppy" -> 0, "Young" -> 1, "Adult" -> 2; any other value -> 3.
    """
    # Labels are listed in code order, so the index is the code.
    for code, label in enumerate(("Puppy", "Young", "Adult")):
        if age == label:
            return code
    return 3
# turn size into size_code
def CodeSize(size):
    """Map a size-range label to its integer code.

    "0-25" -> 0, "26-60" -> 1, "61-100" -> 2; any other value -> 3.
    """
    # Labels are listed in code order, so the index is the code.
    for code, label in enumerate(("0-25", "26-60", "61-100")):
        if size == label:
            return code
    return 3
# Encode the breed pair as a mixed-breed flag (1/0). NOTE: this chunk should be deletable without breaking the rest of the code.
def MixEncoder(br1, br2):
    """Return 0 when both breeds are equal and br2 is not "Mixed Breed", else 1."""
    same_named_breed = br2 != "Mixed Breed" and br1 == br2
    return 0 if same_named_breed else 1