guardian.py
# encoding=utf8
import os

import requests
from bs4 import BeautifulSoup

# Guardian sections to scrape and the month of interest.
sport_name = ['football']
month = '2018/may/'
days = list(range(1, 31))  # days 1-30 (May 31 is not covered)
link_count = 0

for s_name in sport_name:
    # One download folder per section, created on first use.
    download_folder = os.path.join(os.getcwd(), s_name)
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    for day in days:
        # Daily index page listing all articles published that day.
        url = 'https://www.theguardian.com/' + s_name + '/' + month + str(day) + '/all'
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.text, "html5lib")

        # Collect article links whose path contains the year, month and day.
        links = []
        for a in soup.find_all('a', href=True):
            link_test = a['href'].split('/')
            if '2018' in link_test and 'may' in link_test and str(day) in link_test:
                links.append(a['href'])

        for link in links:
            # Fetch each article and dump its paragraph text to a .txt file.
            res = requests.get(link, allow_redirects=False)
            soup1 = BeautifulSoup(res.text, "html5lib")
            download_path = os.path.join(download_folder, os.path.basename(link))
            with open(download_path + '.txt', 'wb') as out_file:
                for p in soup1.find_all('p'):
                    out_file.write(p.text.encode('utf-8'))
            print("Completed {}".format(link))
            link_count += 1

print("Total Number of articles extracted: {}".format(link_count))