-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinverted_idx.py
59 lines (51 loc) · 2.16 KB
/
inverted_idx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import glob
import math
import SetXML
def InvertedFile(file, inverted_index):
    """Merge the (word, tf) records of one vertex file into the inverted index.

    Each non-blank line of *file* is parsed as a printed pair such as
    ``('word', 3)``: surrounding parentheses and the newline are stripped,
    single quotes are removed, and the remainder is split into the word and
    its term frequency.

    Args:
        file: Path of the vertex file to read.
        inverted_index: Dict mapping word -> {document id: tf}. It is
            updated in place.

    Returns:
        The same ``inverted_index`` object that was passed in.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    # The document id is the path with "$" turned back into "/", minus its
    # first 9 and last 4 characters (presumably the "vertices/" prefix and
    # the ".txt" suffix -- verify against how the vertex files are named).
    doc_id = file.replace("$", "/")[9:-4]

    # ``with`` guarantees the handle is closed even if reading/parsing fails.
    with open(file, "r") as handle:
        for line in handle:
            record = line.strip("()\n").replace("'", "")
            if not record.strip():
                # Blank lines carry no record; skip instead of crashing.
                continue
            # Split on the LAST comma only, so a word that itself contains
            # a comma does not break the two-field unpacking.
            word, tf = record.rsplit(",", 1)
            # tf is stored exactly as read (a string, possibly with leading
            # whitespace); callers convert it with float().
            inverted_index.setdefault(word, {})[doc_id] = tf
    return inverted_index
def main(inverted_index):
    """Build a tf-idf weighted inverted index from the vertex files.

    Reads every file matching ``vertices/http*.txt``, merges its records
    into ``inverted_index`` via ``InvertedFile``, replaces each stored term
    frequency with ``tf * idf`` and writes the serialised index to
    ``inverted_index.xml`` and ``inverted_index_text.txt``.

    Args:
        inverted_index: Dict mapping word -> {document id: tf}. It is
            updated in place.

    Returns:
        The same ``inverted_index`` object, now holding float tf-idf weights.
    """
    # Glob once so the document count and the files actually indexed agree.
    vertex_files = glob.glob("vertices/http*.txt")
    vertex_counter = len(vertex_files)
    print("Creating index for " + str(vertex_counter) + " articles...")

    # Original design note (translated from Greek):
    # We traverse all the vectors, and for every new lemma we meet we insert
    # it into the dictionary, i.e.:
    #     for filename in vertex_files:
    #         for line in filename:
    #             if lemma not in inverted_index:
    #                 insert it
    #             else:
    #                 add the new document to the lemma's documents
    # tf-idf weighting: the tf is taken directly from the vector, while the
    # idf can be recomputed on every insertion. It is enough to keep the
    # total number of documents in a variable and to be able to measure the
    # length of each lemma's postings, i.e. in how many documents it occurs.
    # That way, on every new insertion the lemma's weight is refreshed for
    # all of its postings.
    for filename in vertex_files:
        InvertedFile(filename, inverted_index)

    for postings_list in inverted_index.values():
        # idf = log10(N / df); "* 1.0" forces float division on Python 2.
        idf = math.log10(vertex_counter / (len(postings_list) * 1.0))
        # items() works on both Python 2 and 3 (iteritems() is Python 2
        # only); list() snapshots the pairs before the values are rewritten.
        for entry, weight in list(postings_list.items()):
            postings_list[entry] = float(weight) * idf

    # Serialise once and reuse the text for both outputs.
    # NOTE(review): assumes SetXML.SetXML returns the same string for the
    # same index on repeated calls -- confirm against SetXML.
    serialized = SetXML.SetXML(inverted_index)
    # ``with`` closes (and therefore flushes) both files; the text output
    # was previously never closed.
    with open("inverted_index.xml", "w") as xmlout:
        xmlout.write(serialized)
    with open("inverted_index_text.txt", "w") as txtout:
        txtout.write(serialized)
    return inverted_index