-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_content_info.py
211 lines (196 loc) · 7.82 KB
/
test_content_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import imdb
import nltk
import urllib
import difflib
import re
import sqlite3
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import *
from whoosh.index import open_dir
from whoosh import scoring
# Part-of-speech tags whose tokens are kept when condensing a plot into
# index terms (numbers, foreign words, list markers, nouns, symbols,
# interjections in the tagger's tag set).
_KEPT_POS_TAGS = frozenset(
    ('CD', 'FW', 'LS', 'NN', 'NNP', 'NNS', 'NNPS', 'SYM', 'UH'))


def _pick_plot(movie):
    """Return the first usable plot text stored on *movie*, or None.

    'plot' is tried before 'plot outline'.  A list/tuple value contributes
    its first element; any other non-empty value is used whole.
    """
    # Membership is tested against keys(), exactly as the original code did.
    # NOTE(review): `key in movie` on an IMDbPY Movie may not test keys --
    # confirm before simplifying.
    available = movie.keys()
    for key in ('plot', 'plot outline'):
        if key not in available:
            continue
        value = movie[key]
        if isinstance(value, (list, tuple)):
            # An empty list used to raise IndexError on value[0].
            value = value[0] if value else None
        # NOTE(review): IMDbPY documents 'plot outline' as a single string;
        # indexing it with [0] (as before) would keep only one character.
        if value:
            return value
    return None


def _condense(tagged_tokens):
    """Build the indexed text from (token, tag) pairs.

    Keeps tokens whose tag is in _KEPT_POS_TAGS, lowercases them, splits on
    runs of non-word characters and joins the non-empty pieces with spaces.
    Returns an empty string when nothing survives.
    """
    kept = [token for token, tag in tagged_tokens if tag in _KEPT_POS_TAGS]
    if not kept:
        return ''
    pieces = re.split(r'\W+', ' '.join(kept).lower(), flags=re.UNICODE)
    return ' '.join(piece for piece in pieces if piece)


def init_whoosh():
    """Create the Whoosh index in "indexdir" from the titles in 'u.item'.

    Each line of 'u.item' is split on '|'; the first field is parsed as the
    integer movie id and the second is the title.  The title is looked up
    through imdb.IMDb(), the first search hit is updated and its plot text
    is tokenised and POS-tagged with nltk.  The condensed text is stored as
    the `content` field of a document whose `id` is the movie id.  When no
    search hit or no plot text is available, the title itself is used as
    the text.

    The writer is committed once at the end.  If anything fails before the
    commit, the writer is cancelled so its pending changes are discarded,
    and the exception propagates.
    """
    schema = Schema(id=NUMERIC(stored=True), content=TEXT)
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    searcher = imdb.IMDb()

    processed = 1  # progress counter for the log line; not the movie id
    committed = False
    try:
        # `with` guarantees the catalogue file is closed, even on error.
        with open('u.item', 'r') as catalogue:
            for line in catalogue:
                fields = line.replace('\n', '').split('|')
                if len(fields) <= 1:
                    continue  # blank or malformed row

                movie_id = int(fields[0])
                movie_name = fields[1]

                plot = None
                hits = searcher.search_movie(movie_name)
                if hits:  # an empty result used to raise IndexError
                    the_movie = hits[0]
                    searcher.update(the_movie)
                    plot = _pick_plot(the_movie)
                print("%d - I'M GETTING THE PLOT OF THE MOVIE: %s"
                      % (processed, movie_name))
                if plot is None:
                    # Decode only when the fallback is actually needed.
                    plot = movie_name.decode('unicode-escape')

                tagged = nltk.pos_tag(nltk.word_tokenize(plot))
                final_synop = _condense(tagged)
                if final_synop:
                    print('IM INSERTING THE SYNOPSIS: %s' % final_synop)
                    writer.add_document(id=movie_id, content=final_synop)
                processed += 1

        writer.commit()
        committed = True
    finally:
        if not committed:
            # Discard the partial build instead of leaving it pending.
            writer.cancel()
# Script entry point: the index build call below is commented out, so
# running this module directly only prints a placeholder line.
if __name__ == '__main__':
    # init_whoosh()
    print('lol')
# db = sqlite3.connect('mydb')
# cursor = db.cursor()
#
# schema = Schema(id=NUMERIC(stored=True), content=TEXT(stored=True))
# ix = create_in("indexdir", schema)
# writer = ix.writer()
# movie_id = 1331
#
# synop_test = u'The Last Klezmer: Leopold Kozlowski, His Life and Music'
# print 'IM INSERTING THE SYNOPSIS:', synop_test
# writer.add_document(id=movie_id, content=synop_test)
# cmd = """INSERT INTO synops(item_id, synopsys) VALUES("%d", "%s")""" % (movie_id, synop_test)
# cursor.execute(cmd)
# movie_im_URL = 'http://ia.media-imdb.com/images/M/MV5BMTk2NjAwNTM1Ml5BMl5BanBnXkFtZTcwNDEwNDkyMQ@@._V1__SX1305_SY580_.jpg'
# cmd2 = """INSERT INTO movie_images(item_id, image_URL) VALUES("%d", "%s")""" % (movie_id, movie_im_URL)
# cursor.execute(cmd2)
# cmd3 = """INSERT INTO top_actors(item_id, actor_1, actor_2, actor_3) VALUES("%d", "%s", "%s", "%s")""" % (1331, 'Leopold Kozlowski', 'no_actor', 'no_actor')
# cursor.execute(cmd3)
#
# db.commit()
#
# cursor.close()
# db.close()
#
# writer.commit()
# searcher = imdb.IMDb()
#
# # Search for a movie (get a list of Movie objects).
# #s_result = searcher.search_movie('Wolf of Wall Street (2013)')
# s_result = searcher.search_movie('Last Klezmer: Leopold Kozlowski, His Life and Music, The')
#
# print 'O S_RESULT E\'', s_result
# # Print the long imdb canonical title and movieID of the results.
# #for item in s_result:
# # print item['long imdb canonical title'], item.movieID
#
# # Retrieves default information for the first result (a Movie object).
# the_wolf = s_result[0]
#
# searcher.update(the_wolf)
# print 'THE MOVIE KEYS ARE', the_wolf.keys()
#
# akas = the_wolf['akas']
# print 'THE AKAS IS', akas
#
# kind = the_wolf['kind']
# print 'THE KIND IS', kind
#
# cast = the_wolf['cast']
# print 'THE CAST IS', cast
#
# print 'TOP1 CAST IS', cast[0]
# print 'TOP2 CAST IS', cast[1]
# print 'TOP3 CAST IS', cast[2]
#
# director = the_wolf['director']
# print 'THE director IS', director
#
# genres = the_wolf['genres']
# print 'THE genres IS', genres
#
# img_url = the_wolf['full-size cover url']
# print 'THE IMG URL IS', img_url
#
# urllib.urlretrieve(img_url, "movie_cover.jpg")
# #img = urllib2.urlopen(img_url).read()
# # Print some information.
# print 'Runtime:', the_wolf['runtime'][0], 'min.'
# print 'Rating:', the_wolf['rating']
# director = the_wolf['director'] # get a list of Person objects.
# print 'Director:', director[0]
#
# #print 'Plot outline:', the_wolf['plot outline']
# plot = the_wolf['plot'][0]
# #print 'Plot:', plot
#
# movie_synop = nltk.word_tokenize(plot)
# #print 'THE MOVIE SYNOP IS', movie_synop
# movie_synop_tagged = nltk.pos_tag(movie_synop)
# #print 'THE TAGGED MOVIE SYNOP IS', movie_synop_tagged
# new_synop = ''
#
# for item in movie_synop_tagged:
# if item[1] == 'CD' or item[1] == 'FW' or item[1] == 'LS' or item[1] == 'NN' or item[1] == 'NNP' or item[1] == 'NNS' or item[1] == 'NNPS' or item[1] == 'SYM' or item[1] == 'UH':
# new_synop += item[0]
# new_synop += ' '
#
# #print 'THE NEW PLOT IS', new_synop
# new_synop = 'boy Andy toys doll "Woody" life Woody toy birthday party figure Buzz Lightyear toy killer Sid Phillips.'
#
# # s_result = searcher.search_movie('Wall Street')
# # the_movie = s_result[0]
# #
# # searcher.update(the_movie)
# # # Print some information.
# # #print 'Runtime:', the_movie['runtime'][0], 'min.'
# # #print 'Rating:', the_movie['rating']
# # director1 = the_movie['director'] # get a list of Person objects.
# # #print 'Director:', director1[0]
# #
# # #print 'Plot outline:', the_movie['plot outline']
# # plot_2 = the_movie['plot'][0]
# # #print 'Plot:', plot_2
# #
# # movie_synop_2 = nltk.word_tokenize(plot_2)
# # #print 'THE MOVIE SYNOP IS', movie_synop
# # movie_synop_tagged_2 = nltk.pos_tag(movie_synop_2)
# # #print 'THE TAGGED MOVIE SYNOP2 IS', movie_synop_tagged_2
# # new_synop_2 = ''
# #
# # for item in movie_synop_tagged_2:
# # if item[1] == 'CD' or item[1] == 'FW' or item[1] == 'LS' or item[1] == 'NN' or item[1] == 'NNP' or item[1] == 'NNS' or item[1] == 'NNPS' or item[1] == 'SYM' or item[1] == 'UH':
# # new_synop_2 += item[0]
# # new_synop_2 += ' '
# #
# # print 'THE NEW PLOT2 IS', new_synop_2
# #
# # #new_synop_2 = new_synop
# #
# # seq = difflib.SequenceMatcher(a=new_synop.lower(), b=new_synop_2.lower())
# # print 'THE SYNOPSIS SIMILARITY IS', seq.ratio()
#
# ix = open_dir("indexdir")
# queryResults = []
#
# with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
# print 'ESTOU A FAZER A QUERY COM A SYNOP', new_synop
# query = QueryParser("content", ix.schema, group=OrGroup).parse(new_synop)
# results = searcher.search(query, limit=100)
# print 'O TOTAL RESUTLS E\'', results
# for r in results:
# queryResults.append(r['id'])
# if __name__ == "__main__":
# print 'O SCORE ORIGINAL E\'', r.score
# new_score = (r.score + 0.0) / (r.score + 20)
# print 'PARA O DOCNUM', r['id'], 'O SCORE DO RESULT E\'', new_score
# if __name__ == "__main__":
# print "Number of results:", results.scored_length()