#! /usr/bin/env python
"""
Copyright (C) 2007-2009 Vladimir Toncar
Contributors:
Redirect handling by Pavel "ShadoW" Dvorak
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
import sys
import getopt
from urllib import request, robotparser, parse
from urllib.error import URLError
from html.parser import HTMLParser
import xml.sax.saxutils
from datetime import datetime
helpText = """sitemap_gen.py version 1.1.0 (2009-09-05)
Python 3 port by Tassone Roberto (2022-08-14)
This script crawls a web site from a given starting URL and generates
a Sitemap file in the format that is accepted by Google. The crawler
does not follow links to other web sites. It also respects the 'nofollow'
tags and will not crawl into directories disallowed in the robots.txt file.
Command line syntax:
python sitemap_gen.py <options> <starting URL>
Available options:
-h --help Print this text and exit
-b <ext> --block <ext> Exclude URLs with the given extension;
<ext> must be without the leading dot.
The comparison is case insensitive, so
for example DOC and doc are treated
the same. You can use this option several
times to block several extensions.
Some multimedia/document extensions
(e.g. JPG, MP3, etc.) are blocked by
default.
-c <value> --changefreq <value> Set the change frequency. The given value
is used in all sitemap entries (maybe a
future version of this script will change
that). The allowed values are: always,
hourly, daily, weekly, monthly, yearly,
never.
-p <prio> --priority <prio> Set the priority. The value must be from
the interval between 0.0 and 1.0. The value
will be used in all sitemap entries.
-m <value> --max-urls <value> Set the maximum number of URLs to be crawled.
The default value is 1000 and the largest
value that you can set is 50000 (the script
generates only a single sitemap file).
-o <file> --output-file <file> Set the name of the geneated sitemap file.
The default file name is sitemap.xml.
Usage example:
python sitemap_gen.py -b doc -b bmp -o test_sitemap.xml http://www.your-site-name.com/index.html
For more information, visit http://toncar.cz/opensource/sitemap_gen.html
"""
allowedChangefreq = [
"always",
"hourly",
"daily",
"weekly",
"monthly",
"yearly",
"never"
]
def getPage(url):
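    """Fetch a page and return a (page, date, url) tuple.

    page is the decoded document body (or None on error), date is the
    Last-Modified date as (year, month, day) or (0, 0, 0) when unknown,
    and url is the final URL after any redirects.
    """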
    try:
        f = request.urlopen(url)
        page = ""
        for i in f.readlines():
            page = page + i.decode('UTF-8')
        date = f.getheader('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            formattedDate = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
            date = (formattedDate.year, formattedDate.month, formattedDate.day)
        f.close()
        return (page, date, f.url)
    except (URLError, UnicodeDecodeError) as detail:
        print("%s. Skipping..." % (detail))
        return (None, (0, 0, 0), "")
# end def


def joinUrls(baseUrl, newUrl):
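    """Join newUrl onto baseUrl, dropping any #fragment from newUrl."""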
    helpUrl, fragment = parse.urldefrag(newUrl)
    return parse.urljoin(baseUrl, helpUrl)
# end def


def getRobotParser(startUrl):
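    """Fetch and parse the site's robots.txt; return a RobotFileParser, or None if unavailable."""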
    rp = robotparser.RobotFileParser()
    robotUrl = parse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        print("Could not read ROBOTS.TXT at:", robotUrl)
        return None
    # end if
    # RobotFileParser.parse() expects an iterable of lines, not one string
    rp.parse(page.split("\n"))
    print("Found ROBOTS.TXT at:", robotUrl)
    return rp
# end def


class MyHTMLParser(HTMLParser):
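    """Collects same-server links from <a href> tags into pageMap, honouring
    <base href>, rel="nofollow", blocked extensions, known redirects,
    robots.txt rules and the maxUrls limit."""
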
    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = parse.urlsplit(baseUrl)[1]
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser
    # end def

    def hasBlockedExtension(self, url):
        p = parse.urlparse(url)
        path = p[2].upper()  # path attribute
        # In python 2.5, endswith() also accepts a tuple,
        # but let's make it backwards compatible
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0
    # end def

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return

        if (tag.upper() == "BASE"):
            if (attrs[0][0].upper() == "HREF"):
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
                print("BASE URL set to", self.baseUrl)

        if (tag.upper() == "A"):
            # print "Attrs:", attrs
            url = ""
            # Let's scan the list of tag's attributes
            for attr in attrs:
                # print "  attr:", attr
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    # We have discovered a nofollow, so we won't continue
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    # We have discovered a link that is not a Mailto:
                    url = joinUrls(self.baseUrl, attr[1])
            # end for

            # If the url is empty, there was none in the list of attributes
            if url == "":
                return

            # Check if we want to follow the link
            if parse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if (self.robotParser is not None) and not(self.robotParser.can_fetch("*", url)):
                print("URL restricted by ROBOTS.TXT: ", url)
                return
            # It's OK to add url to the map and fetch it later
            if not(url in self.pageMap.keys()):
                self.pageMap[url] = ()
        # end if
    # end def
# end class


def getUrlToProcess(pageMap):
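    """Return a URL from pageMap that has not been fetched yet (value ()), or None."""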
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None


def parsePages(startUrl, maxUrls, blockExtensions):
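    """Crawl the site starting at startUrl and return a dict mapping each
    discovered URL to its Last-Modified date as (year, month, day),
    (0, 0, 0) when unknown, or () when the page was never fetched."""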
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []

    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print(" ", url)
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            print("Redirect -> " + newUrl)
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url,
                                  maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            # except HTMLParseError:
            #     print("Error parsing %s, skipping." % (url))
            except UnicodeDecodeError:
                print("Failed decoding %s. Try to check if the page is valid." % (url))
    # end while

    return pageMap
# end def


def generateSitemapFile(pageMap, fileName, changefreq="", priority=0.0):
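    """Write pageMap to fileName as a sitemaps.org urlset document.

    changefreq and priority, when given, are applied to every entry."""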
fw = open(fileName, "wt")
fw.write('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n''')
for i in pageMap.keys():
fw.write('<url>\n <loc>%s</loc>\n' % (xml.sax.saxutils.escape(i)))
if pageMap[i] not in [(), (0, 0, 0)]:
fw.write('\t<lastmod>%4d-%02d-%02d</lastmod>\n' % pageMap[i])
if changefreq != "":
fw.write('\t<changefreq>%s</changefreq>\n' % (changefreq))
if priority > 0.0:
fw.write('\t<priority>%1.1f</priority>\n' % (priority))
fw.write('</url>\n')
# end for
fw.write('</urlset>')
fw.close()
# end def
def main():
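    """Parse the command line, crawl the site and write the sitemap file."""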
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "hb:c:m:p:o:",  # -h/--help takes no argument
                                   ["help", "block=", "changefreq=",
                                    "max-urls=", "priority=", "output-file="])
    except getopt.GetoptError:
        print(helpText)
        return

    blockExtensions = [
        # images
        'JPG',
        'JPEG',
        'GIF',
        'BMP',
        'PNG',
        # video
        'MP4',
        'MKV',
        'AVI',
        # audio
        'MP3',
        'AAC',
        # documents
        'PDF',
        'DOC',
        'DOCX',
        'TXT',
        # archives
        'ZIP',
        'RAR',
        'TAR',
        'GZ'
    ]
    changefreq = ""
    priority = 0.0
    fileName = "sitemap.xml"
    maxUrls = 1000
    pageMap = {}

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(helpText)
            return
        elif opt in ("-b", "--block"):
            blockExtensions.append("." + arg.upper())
        elif opt in ("-c", "--changefreq"):
            if arg in allowedChangefreq:
                changefreq = arg
            else:
                print("Allowed changefreq values are:")
                for i in allowedChangefreq:
                    print(i)
                print()
                return
        elif opt in ("-m", "--max-urls"):
            maxUrls = int(arg)
            if (maxUrls < 0) or (maxUrls > 50000):
                print("The maximum number of URLs must be greater than 0 and smaller than 50000")
                return
        elif opt in ("-p", "--priority"):
            priority = float(arg)
            if (priority < 0.0) or (priority > 1.0):
                print("Priority must be between 0.0 and 1.0")
                return
        elif opt in ("-o", "--output-file"):
            fileName = arg
            if fileName in ("", ".", ".."):
                print("Please provide a sensible file name")
                return
        # end if

    if len(args) == 0:
        print("You must provide the starting URL.\nTry the -h option for help.")
        return

    # Set user agent string
    opener = request.build_opener()
    opener.addheaders = [('User-agent', 'sitemap_gen/1.0')]
    request.install_opener(opener)

    # Start processing
    print("Crawling the site...")
    pageMap = parsePages(args[0], maxUrls, blockExtensions)
    print("Generating sitemap: %d URLs" % (len(pageMap)))
    generateSitemapFile(pageMap, fileName, changefreq, priority)
    print("Finished.")
# end def


if __name__ == '__main__':
    main()