# WebCrawler.py
import urllib.request
from bs4 import BeautifulSoup
import sys
import re
import urllib.parse
import urllib.error
import hashlib
import string
import codecs
from nltk.stem import PorterStemmer


class WebCrawler:
    def __init__(self, seed_url):
        self.seed_url = seed_url
        self.domain_url = "/".join(self.seed_url.split("/")[:3])
        self.robots_txt = None
        self.stop_words_file = None
        self.page_limit = None
        self.stop_words = []        # list of words to be ignored when processing documents
        self.url_frontier = []      # list of urls not yet visited
        self.visited_urls = {}      # URL : (Title, DocumentID) (hash of the content of a visited URL)
        self.outgoing_urls = []     # urls that point outside the scope of the seed url
        self.broken_urls = []       # urls that could not be fetched (HTTP errors)
        self.graphic_urls = []      # urls of graphic files (.gif, .png, .jpeg, .jpg)
        self.all_terms = []         # set of all stemmed terms in all documents
        self.frequency_matrix = []  # term-document frequency matrix (row = term, col = doc)
        self.num_pages_crawled = 0  # number of valid pages visited
        self.num_pages_indexed = 0  # number of pages whose words have been stored
        """
        note: the attributes below only contain information from those documents whose
        words were saved (.txt, .htm, .html, .php)
        """
        self.duplicate_urls = {}    # DocumentID : [URLs that produce that ID]
        self.doc_urls = {}          # DocumentID : first URL that produces that ID
        self.doc_titles = {}        # DocumentID : title
        self.doc_words = {}         # DocumentID : [words]

    # return the report produced from crawling a site
    def __str__(self):
        report = "\nPages crawled: " + str(self.num_pages_crawled) \
            + "\nPages indexed: " + str(self.num_pages_indexed) \
            + "\nVisited URLs: " + str(len(self.visited_urls)) \
            + "\n\nOutgoing URLs: " + "\n + " + "\n + ".join(self.outgoing_urls) \
            + "\n\nBroken URLs: " + "\n + " + "\n + ".join(self.broken_urls) \
            + "\n\nGraphic URLs: " + "\n + " + "\n + ".join(self.graphic_urls) \
            + "\n\nDuplicate URLs:\n"
        # list each group of duplicate urls
        for i, urls in enumerate(self.duplicate_urls.values()):
            report += "\t + Doc" + str(i + 1) + ":\n"
            for val in urls:
                report += "\t\t + " + val + "\n"
        return report

    '''
    Returns a dictionary of allowed and disallowed urls
    Adapted from https://stackoverflow.com/a/43086135/8853372
    '''
    def get_robots_txt(self):
        # fetch the seed url's robots.txt
        result = urllib.request.urlopen(self.seed_url + "/robots.txt").read()
        result_data_set = {"Disallowed": [], "Allowed": []}
        # for each line in the file
        for line in result.decode("utf-8").split('\n'):
            if line.startswith("Allow"):
                result_data_set["Allowed"].append(
                    self.seed_url + line.split(": ")[1].split('\r')[0]
                )  # strip trailing comments and other junk from the rule
            elif line.startswith("Disallow"):  # this is a disallowed url
                result_data_set["Disallowed"].append(
                    self.seed_url + line.split(": ")[1].split('\r')[0]
                )  # strip trailing comments and other junk from the rule
        return result_data_set

    def set_page_limit(self, limit):
        self.page_limit = int(limit)

    # sets the stop words list given a file with one stop word per line
    def set_stop_words(self, filepath):
        try:
            with open(filepath, "r") as stop_words_file:
                stop_words = stop_words_file.readlines()
                self.stop_words = [x.strip() for x in stop_words]
            self.stop_words_file = filepath
        except IOError as e:
            print("Error opening " + filepath + ", error({0}): {1}".format(e.errno, e.strerror))
        except ValueError:
            print("Error opening " + filepath + ": Data is not correctly formatted. See README.")
        except:
            print("Error opening " + filepath + ". Unexpected error:", sys.exc_info()[0])
            raise

    '''
    returns whether or not a url is valid
    source: https://stackoverflow.com/a/7160778/8853372
    '''
    def url_is_valid(self, url_string):
        pattern = re.compile(
            r'^(?:http|ftp)s?://'  # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
            r'localhost|'  # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$',
            re.IGNORECASE)
        return bool(pattern.match(url_string))

    '''
    returns whether or not a given word is valid
    A word is a string of non-space characters, beginning with an alphabetic character.
    It may contain special characters, but the last character of a word is either alphabetic or numeric.
    '''
    def word_is_valid(self, word):
        pattern = re.compile(r'^[a-zA-Z](\S*)[a-zA-Z0-9]$')
        return bool(pattern.match(word))

    # returns whether or not the url is within the scope of the seed url
    def url_is_within_scope(self, url_string):
        return self.seed_url in url_string

    '''
    produces a list of duplicate documents
    populates self.duplicate_urls with DocumentID : [URLs that produce that ID]
    '''
    def produce_duplicates(self):
        duplicates = {}
        # populate duplicates with DocumentID : [URLs]
        for url, (_, docID) in self.visited_urls.items():
            duplicates[docID] = [url] if duplicates.get(docID) is None else duplicates[docID] + [url]
        # keep only those DocumentIDs that are produced by more than one URL
        self.duplicate_urls = {docID: urls for docID, urls in duplicates.items() if len(urls) > 1}

    # crawls the site starting from the seed url, populating the crawler's attributes as it goes
    def crawl(self):
        # fetch and display the robots.txt rules for the site
        self.robots_txt = self.get_robots_txt()
        print("robots.txt: " + " ".join("{}{}".format(key, [
            v.replace(self.domain_url, "") for v in val]) for key, val in self.robots_txt.items()) + "\n")
        self.url_frontier.append(self.seed_url + "/")
        '''
        pop from the URL frontier while the queue is not empty
        links in the queue are valid, full urls
        '''
        while self.url_frontier and (self.page_limit is None or self.num_pages_indexed < self.page_limit):
            # current_page refers to the url of the current page being processed
            current_page = self.url_frontier.pop(0)  # select the next url
            # calculate the present working directory
            pwd = "/".join(current_page.split("/")[:-1]) + "/"
            if pwd not in self.robots_txt["Disallowed"]:
                try:
                    # hit the current page
                    handle = urllib.request.urlopen(current_page)
                # basic HTTP error, e.g. 404, 501, etc.
                except urllib.error.HTTPError:
                    if current_page is not None and current_page not in self.broken_urls:
                        self.broken_urls.append(current_page)
                else:
                    current_content = str(handle.read())
                    # convert the content to BeautifulSoup for easy html parsing
                    soup = BeautifulSoup(current_content, "lxml")
                    # grab the title of the page; fall back to the file name if no title is available (e.g. a PDF file)
                    current_title = str(soup.title.string) if soup.title is not None else current_page.replace(pwd, '')
                    # hash the content of the page to produce a unique DocumentID
                    current_doc_id = hashlib.sha256(current_content.encode("utf-8")).hexdigest()
                    # mark the page as visited by adding it to visited_urls
                    self.visited_urls[current_page] = (current_title, current_doc_id)
                    self.num_pages_crawled += 1
                    print(str(self.num_pages_crawled) + ". " + "Visiting: " +
                          current_page.replace(self.domain_url, "") + " (" + current_title + ")")
                    # if the page is an html or text document, parse it for words and links
                    if any(current_page.lower().endswith(ext) for ext in ["/", ".html", ".htm", ".php", ".txt"]):
                        # remove the contents of the title tag
                        [s.extract() for s in soup('title')]
                        # format the content of the page
                        formatted_content = codecs.escape_decode(
                            bytes(soup.get_text().lower(), "utf-8"))[0].decode("utf-8", errors='replace')
                        # store only the words of the file
                        content_words = list(re.sub('[' + string.punctuation + ']', '', formatted_content).split())
                        # remove the 'b' character prepended when the raw bytes were converted to a string
                        content_words[0] = content_words[0][1:]
                        # keep only those words that are valid and not in the stop word collection
                        self.doc_words[current_doc_id] = [w for w in content_words
                                                          if w not in self.stop_words and self.word_is_valid(w)]
                        # store the title
                        self.doc_titles[current_doc_id] = current_title
                        # store the url if it hasn't been stored already (to avoid duplicates)
                        if current_doc_id not in self.doc_urls:
                            self.doc_urls[current_doc_id] = current_page
                        self.num_pages_indexed += 1
                        # go through each link in the page
                        for link in soup.find_all('a'):
                            # current_url refers to the current link within the page being processed
                            current_url = link.get('href')
                            # expand relative urls to full urls
                            if current_url is not None and pwd not in current_url:
                                # resolve the link against the present working directory
                                current_url = urllib.parse.urljoin(pwd, current_url)
                            # the link should be visited
                            if current_url is not None and self.url_is_valid(current_url):
                                # the link is within scope and hasn't been added to the queue
                                if self.url_is_within_scope(current_url) and current_url not in self.url_frontier:
                                    # ensure the url hasn't been visited before adding it to the queue
                                    if current_url not in self.visited_urls.keys():
                                        self.url_frontier.append(current_url)
                                elif not self.url_is_within_scope(current_url) and current_url not in self.outgoing_urls:
                                    self.outgoing_urls.append(current_url)
                            # the link is broken
                            elif current_url is not None and current_url not in self.broken_urls:
                                self.broken_urls.append(current_url)
                    # the file is a graphic, mark it as such
                    elif any(current_page.lower().endswith(ext) for ext in [".gif", ".png", ".jpeg", ".jpg"]):
                        self.graphic_urls.append(current_page)
            else:
                print("Not allowed: " + current_page.replace(self.domain_url, ""))

    '''
    converts the word listings into a term-document frequency matrix
    populates frequency_matrix and all_terms
    '''
    def build_frequency_matrix(self):
        if self.doc_words is not None:
            # use the porter stemmer for comparing words
            stemmer = PorterStemmer()
            # grab the unique, stemmed terms from all the documents
            self.all_terms = list(set([stemmer.stem(word)
                                       for word_list in self.doc_words.values()
                                       for word in word_list]))
            self.all_terms.sort()
            # initialize the frequency matrix with one row per term
            self.frequency_matrix = [[] for _ in self.all_terms]
            # add term frequencies to the matrix
            for term in range(len(self.all_terms)):
                # count how many stemmed words in each document match the current term
                frequency_count = []
                for word_list in self.doc_words.values():
                    stemmed_word_list = [stemmer.stem(word) for word in word_list]
                    frequency_count.append(stemmed_word_list.count(self.all_terms[term]))
                self.frequency_matrix[term] = frequency_count

    # returns the contents of self.frequency_matrix as a comma-separated string
    def print_frequency_matrix(self):
        output_string = ","
        if self.doc_words is not None:
            # create the column heading
            for i in range(len(self.doc_words.keys())):
                output_string += "Doc" + str(i) + ","
            output_string += "\n"
            # add each matrix row to the output string
            for i in range(len(self.frequency_matrix)):
                output_string += self.all_terms[i] + "," + ",".join([str(count) for count in self.frequency_matrix[i]]) + "\n"
        return output_string

    # returns a zip object containing n elements: (term, total term frequency, doc frequency)
    def n_most_common(self, n):
        term_totals = []
        doc_freqs = []
        # calculate the total frequency and doc frequency of each term
        for row in self.frequency_matrix:
            term_totals.append(sum(row))
            doc_freqs.append(sum([1 for x in row if x > 0]))
        # sort terms, totals, and doc frequencies together by total frequency,
        # keeping each term aligned with its own counts
        ranked = sorted(zip(term_totals, self.all_terms, doc_freqs))
        sorted_terms = [term for _, term, _ in ranked]
        sorted_totals = [total for total, _, _ in ranked]
        sorted_doc_freqs = [freq for _, _, freq in ranked]
        # return the n most common
        return zip(reversed(sorted_terms[-n:]), reversed(sorted_totals[-n:]), reversed(sorted_doc_freqs[-n:]))
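

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the crawler itself: the seed URL, page
# limit, and stop-words path below are hypothetical placeholders, and the
# repository's actual driver script may invoke the class differently.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    crawler = WebCrawler("https://www.example.com")  # hypothetical seed url
    crawler.set_page_limit(25)                       # stop indexing after 25 pages
    crawler.set_stop_words("stopwords.txt")          # hypothetical file, one stop word per line
    crawler.crawl()                                  # visit pages, collecting words and links
    crawler.produce_duplicates()                     # group urls whose content hashes to the same DocumentID
    crawler.build_frequency_matrix()                 # build the term-document frequency matrix
    print(crawler)                                   # crawl report
    print(crawler.print_frequency_matrix())          # comma-separated term-document matrix
    for term, total, doc_freq in crawler.n_most_common(10):
        print(term, total, doc_freq)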