-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier.py
318 lines (259 loc) · 10.9 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
from numpy import *
from scipy import *
from numpy import linalg as LA
from matplotlib import pyplot as plt
import scipy.io as io
from scipy.sparse import *
from time import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import linear_model
# The whole classifier is wrapped in a class so that it is easy to import.
class FMRIWordClassifier:
    """Predict which word a subject saw from an fMRI scan.

    A lasso regression maps each scan (one value per voxel) to a vector of
    semantic features.  A scan is then classified by finding the word whose
    row in the standardised word-feature table (``wfs``) is closest to the
    predicted feature vector.

    All data is loaded from Matrix Market files in ``read_data``, which is
    called from the constructor.
    """

    def __init__(self):
        # Scan matrices, one row per sample and one column per voxel.
        self.fmri_train = None
        self.fmri_test = None
        # 1-based word ids; only column 0 is read by this class.
        self.wordid_train = None
        self.wordid_test = None
        # Word-feature tables (centred / standardised), one row per word.
        # Only wfs is used by the methods below.
        self.wfc = None
        self.wfs = None
        # Maximum number of coordinate-descent sweeps in solve_lasso.
        self.MAX_RR = 200
        # Not read anywhere in this class.
        self.sf = 1
        # solve_lasso stops once no weight moves by more than this in a sweep.
        self.precision = .001
        # Not read anywhere in this class; methods take lamb as an argument.
        self.lamb = 50
        # Fill up all of the above.
        self.read_data()

    def _train_word_features(self):
        """Return the ``wfs`` row of every training sample, stacked row-wise.

        Word ids are 1-based, hence the ``- 1``.  The id is cast to ``int``
        because ids loaded as floats cannot be used as indices on current
        NumPy releases.
        """
        return array([self.wfs[int(word_id) - 1]
                      for word_id in self.wordid_train[:, 0]])

    def _select_split(self, trainOrTest):
        """Return ``(scans, word_ids)`` for the test split (1) or training split."""
        if trainOrTest == 1:
            return self.fmri_test, self.wordid_test[:, 0]
        return self.fmri_train, self.wordid_train[:, 0]

    def _count_mistakes(self, predict, trainOrTest, dist):
        """Classify every scan of the chosen split and count the misses.

        ``predict`` maps one scan row to its predicted semantic vector.
        Prints and returns the number of misclassified scans.
        """
        scans, word_ids = self._select_split(trainOrTest)
        mistakes = 0
        for i, scan in enumerate(scans):
            if self.CalculateL2DistAgainstTestSet(predict(scan), word_ids, i, dist):
                mistakes += 1
        print("Total Mistakes:  {0}".format(mistakes))
        return mistakes

    def setup_reverse_lasso(self, lamb):
        """Fit the reverse model (semantic features -> voxel) and save it.

        One lasso problem is solved per voxel, with the word-feature rows of
        the training samples as inputs and that voxel's activation as the
        target.  Row ``i`` of the saved matrix is ``[w_0, w_1, ...]`` for
        voxel ``i``.  The result is written in sparse Matrix Market format
        to ``'weight_vec_rev<lamb>.out'``.
        """
        X = self._train_word_features()
        # Sizes come from the data rather than being hard-coded, so the
        # sample count passed to solve_lasso always matches the loaded split.
        n_samples, n_features = X.shape
        n_voxels = self.fmri_train.shape[1]
        w_init = zeros(n_features + 1)
        final_weight_vector = zeros((n_voxels, n_features + 1))
        for voxel in range(n_voxels):
            t1 = time()
            final_weight_vector[voxel, :] = self.solve_lasso(
                lamb, X, self.fmri_train, n_samples, n_features, w_init, voxel)
            t2 = time()
            print("Iteration {0} took about:  {1}".format(voxel, t2 - t1))
        # Store sparsely; most lasso weights are exactly zero.
        sparse_weight_vector = coo_matrix(final_weight_vector)
        io.mmwrite('weight_vec_rev{0}.out'.format(lamb), sparse_weight_vector,
                   field='real', precision=25)

    def setup_for_lasso(self, lamb):
        """Fit the forward model (voxels -> semantic feature) and save it.

        One lasso problem is solved per semantic feature, with the training
        scans as inputs and that feature's value for the shown word as the
        target.  Row ``i`` of the saved matrix is ``[w_0, w_1, ...]`` for
        feature ``i``: the intercept followed by one weight per voxel.  The
        result is written in sparse Matrix Market format to
        ``'new_weight_vec<lamb>.out'``.
        """
        # The target is not the word id itself but the whole feature row
        # representing that word.
        Y = self._train_word_features()
        n_samples, n_voxels = self.fmri_train.shape
        n_features = Y.shape[1]
        # One extra slot at index 0 for the intercept w_0.
        w_init = zeros(n_voxels + 1)
        final_weight_vector = zeros((n_features, n_voxels + 1))
        for feature in range(n_features):
            t1 = time()
            final_weight_vector[feature, :] = self.solve_lasso(
                lamb, self.fmri_train, Y, n_samples, n_voxels, w_init, feature)
            t2 = time()
            print("Iteration {0} took about:  {1}".format(feature, t2 - t1))
        # Store sparsely; most lasso weights are exactly zero.
        sparse_weight_vector = coo_matrix(final_weight_vector)
        io.mmwrite('new_weight_vec{0}.out'.format(lamb), sparse_weight_vector,
                   field='real', precision=25)

    def read_data(self):
        """Load the scans, word ids and word-feature tables from disk.

        NOTE(review): the rest of the class slices these objects like dense
        arrays, so the files are presumably in Matrix Market *array* format
        (``mmread`` returns a sparse matrix for coordinate files) - confirm.
        """
        # Training / held-out split of the scans; the file names suggest
        # 240 and 60 samples respectively - TODO confirm.
        self.fmri_train = io.mmread("fmri_train_240.out.mtx")
        self.fmri_test = io.mmread("fmri_train_60.out.mtx")
        # Word ids matching the two scan matrices row for row.
        self.wordid_train = io.mmread("wordid_train_240.out.mtx")
        self.wordid_test = io.mmread("wordid_train_60.out.mtx")
        # Word-feature tables, one row per word.
        self.wfc = io.mmread("fmri/word_feature_centered.mtx")
        self.wfs = io.mmread("fmri/word_feature_std.mtx")

    def solve_lasso(self, lamb, X, Y, N, d, init_w, y_idx):
        """Solve one lasso problem by cyclic coordinate descent.

        Fits ``Y[:, y_idx] ~ w_0 + X.dot(w)``.  Each sweep applies a
        soft-thresholding update with threshold ``lamb`` to every weight in
        turn and then recomputes the intercept.  The forward model calls
        this once per semantic feature, producing one weight per voxel; the
        dot product of those weights with a scan (plus ``w_0``) is the
        predicted score for that feature.

        Parameters:
            lamb: L1 penalty / soft-threshold level.
            X: 2-D input matrix, one row per sample.
            Y: 2-D target matrix; only column ``y_idx`` is used.
            N: number of samples; divides the residual sum to get ``w_0``.
            d: number of columns of ``X`` to fit.
            init_w: starting vector of length ``d + 1`` with the intercept
                at index 0.  It is copied, never modified.
            y_idx: column of ``Y`` to fit.

        Returns:
            A float array of length ``d + 1``: ``[w_0, w_1, ..., w_d]``.
            The solver stops when the largest change of any entry in one
            sweep is below ``self.precision`` or after ``self.MAX_RR``
            sweeps, whichever comes first.
        """
        # Float copy, so the caller's vector is untouched and fractional
        # weights are never truncated by an integer dtype.
        w_tilde = array(init_w, dtype=float)
        w_0 = w_tilde[0]
        target = Y[:, y_idx]
        # Per-column denominator 2 * sum_i X_ij^2, constant across sweeps.
        a_j_pre = 2 * (X ** 2).sum(axis=0)
        # Running value of X.dot(w), updated after every weight change so
        # the inner loop never needs a full matrix product.
        Xw = X.dot(w_tilde[1:])
        for sweep in range(1, self.MAX_RR + 1):
            w_prev = w_tilde.copy()
            for voxel_j in range(d):
                column = X[:, voxel_j]
                w_old = w_tilde[voxel_j + 1]
                # Correlation of this column with the residual computed
                # as if this column's own weight were zero.
                c_j = 2 * column.dot(target - Xw + column * w_old - w_0)
                if c_j < -lamb:
                    w_new = (c_j + lamb) / a_j_pre[voxel_j]
                elif c_j > lamb:
                    w_new = (c_j - lamb) / a_j_pre[voxel_j]
                else:
                    w_new = 0.0
                w_tilde[voxel_j + 1] = w_new
                Xw += (w_new - w_old) * column
            # Recalculate w_0 as the mean residual, once per sweep.
            w_0 = (target - X.dot(w_tilde[1:])).sum() / N
            w_tilde[0] = w_0
            if abs(w_tilde - w_prev).max() < self.precision:
                print("Done in  {0}".format(sweep))
                return w_tilde
        print("Not done after MAX Iterations - Exiting!")
        return w_tilde

    def SolveFastLasso(self, trainOrTest=0, dist=0):
        """Fit scikit-learn's Lasso (alpha=25.0) and score one split.

        Parameters:
            trainOrTest: 1 scores the test split, anything else the
                training split.
            dist: 0 matches by Euclidean distance, anything else by cosine
                similarity.

        Returns:
            The number of misclassified scans (also printed).
        """
        Y = self._train_word_features()
        lasso = linear_model.Lasso(alpha=25.0)
        lasso.fit(self.fmri_train, Y)
        w_0 = lasso.intercept_
        a = lasso.coef_

        def predict(scan):
            # Semantic vector for one scan: weights . scan + intercepts.
            return a.dot(scan) + w_0

        return self._count_mistakes(predict, trainOrTest, dist)

    def CalcSemanticFeatureVector(self, trainOrTest=1, fread='weight_vec0.out.mtx', dist=0):
        """Score one split with a weight matrix previously saved to disk.

        Parameters:
            fread: Matrix Market file whose row ``i`` is ``[w_0, w_1, ...]``
                for semantic feature ``i``.  It must load as a sparse
                matrix, because ``todense()`` is called on the result.
            trainOrTest: 1 scores the test split, anything else the
                training split.
            dist: 0 matches by Euclidean distance, anything else by cosine
                similarity.

        Returns:
            The number of misclassified scans (also printed).
        """
        # Convert to a plain ndarray: np.matrix is rejected by current
        # scikit-learn pairwise functions.
        weights = asarray(io.mmread(fread).todense())
        a = weights[:, 1:]
        # Column 0 holds the intercepts, added to every prediction.
        w_0 = weights[:, 0]

        def predict(scan):
            # Semantic vector for one scan: weights . scan + intercepts.
            return a.dot(scan) + w_0

        return self._count_mistakes(predict, trainOrTest, dist)

    def CalculateL2DistAgainstTestSet(self, semantic_vec, Y, expected_idx, dist_metric=0):
        """Guess the word for one semantic vector and report a miss.

        The vector is compared against every row of ``self.wfs`` with both
        Euclidean distance and cosine similarity; ``dist_metric`` selects
        which guess counts (0 is Euclidean, anything else is cosine).

        Parameters:
            semantic_vec: predicted feature vector, 1-D or a single row.
            Y: 1-based word ids of the split being scored.
            expected_idx: position in ``Y`` of the sample being scored.
            dist_metric: 0 for Euclidean, anything else for cosine.

        Returns:
            True if the selected guess is wrong, False otherwise.
        """
        # The pairwise functions need a 2-D (1, n_features) ndarray.
        query = asarray(semantic_vec).reshape(1, -1)
        l2idx = argmin(euclidean_distances(query, self.wfs))
        cosidx = argmax(cosine_similarity(query, self.wfs))
        compIndex = l2idx if dist_metric == 0 else cosidx
        # Word ids are 1-based; rows of wfs are 0-based.
        expected = Y[expected_idx] - 1
        print("Expected Word Index:  {0}  | CosIDX:  {1}  | EucIdx {2}".format(
            expected, cosidx, l2idx))
        return bool(compIndex != expected)
def main():
    """Load the data and score a previously saved weight matrix.

    The training calls are left disabled below; enable one of them to
    produce the weight file before scoring.
    """
    classifier = FMRIWordClassifier()

    # Disabled: train the forward model for several lambdas.
    #     for lamb in [30, 35, 40, 0]:
    #         classifier.setup_for_lasso(lamb)
    #         print("Done with lambda {0}".format(lamb))
    #
    # setup_for_lasso(lamb) writes to 'new_weight_vec<lamb>.out'
    # (mmwrite appends the '.mtx' suffix when it is missing).
    #     classifier.setup_for_lasso(50)
    #     classifier.setup_reverse_lasso(25)
    #     classifier.SolveFastLasso(trainOrTest=0, dist=0)

    # trainOrTest: 0 scores the training samples, 1 the test samples.
    # dist: 0 for L2 distance, 1 for cosine similarity.
    classifier.CalcSemanticFeatureVector(
        trainOrTest=0, fread='new_weight_vec50.out.mtx', dist=1)

    # Disabled: tiny hand-made problem for trying solve_lasso on its own.
    #     X = array([[0, 0], [1, 1], [2, 2]])
    #     Y = array([[0], [1], [2]])
    #     N = 3
    #     d = 2
    #     init_w = zeros(3)
    #     y_idx = 0
    #     ret = classifier.solve_lasso(0.1, X, Y, N, d, init_w, y_idx)
    #     print(ret)


if __name__ == '__main__':
    main()