forked from sheepzh/poetry
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwww_gucheng_net.py
71 lines (59 loc) · 2.06 KB
/
www_gucheng_net.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests
from bs4 import BeautifulSoup
import re
import os
import sys
import time
from util import Profile, write_poem
# Site root; link targets scraped from the index page are appended to it.
URL = 'http://www.gucheng.net/'
# Absolute address of the index page that lists the poems.
HOME_PAGE = '{}gc/gczp/gcsg/sgzj/200806/5798.html'.format(URL)
def read_poem(href):
    """Download a single poem page and hand its text to ``write_poem``.

    Args:
        href: Link target collected from the index page. It is appended
            to ``URL`` as-is to form the page address.

    Returns:
        None. If the request fails, or the page has no ``<td id="fontzoom">``
        element, a message is printed and nothing is written.
    """
    poem_url = URL + href
    try:
        # Without a timeout a single stalled connection blocks the crawl
        # forever.
        response = requests.get(poem_url, timeout=30)
    except requests.RequestException as err:
        # One unreachable page should not abort the remaining downloads.
        print('Request failed: ' + poem_url + ' (' + str(err) + ')')
        return

    # GB18030 is a superset of GB2312/GBK, so characters outside the strict
    # GB2312 table are decoded rather than replaced.
    response.encoding = 'GB18030'
    text = response.text.replace('<br/>', '\n')
    soup = BeautifulSoup(text, "lxml")

    title_container = soup.find('td', class_='main_ArticleTitle')
    title = title_container.text.strip() if title_container else ''

    content_td = soup.find('td', id='fontzoom')
    if not content_td:
        print('No content: ' + poem_url)
        return

    lines = []
    for paragraph in content_td.find_all('p'):
        # Each paragraph is preceded by a blank line in the output.
        lines.append('\r\n\r\n')
        # Child nodes are serialised individually; any <br/> the parser
        # emits for other spellings of the tag becomes a line break.
        lines.extend(
            str(node).strip().replace('<br/>', '\r\n')
            for node in paragraph.contents
        )
    poem_content = ''.join(lines)

    write_poem(Profile(href=poem_url, author='顾城', title=title), poem_content)
def main():
    """Collect poem links from the index page and download each poem.

    Links are taken from ``<a>`` tags inside ``<p class="MsoNormal">``
    elements of the ``<table class="mainbox">`` on ``HOME_PAGE``. Each
    accepted link is passed to ``read_poem`` in the order it first appears.

    Returns:
        None. If the index table cannot be found, a message is printed and
        nothing is downloaded.
    """
    # A timeout keeps a stalled connection from hanging the script.
    response = requests.get(HOME_PAGE, timeout=30)
    soup = BeautifulSoup(response.content, "lxml", from_encoding="GB2312")

    mainbox = soup.find('table', class_='mainbox')
    if mainbox is None:
        print('No poem index: ' + HOME_PAGE)
        return

    # Links whose title contains one of these markers are skipped.
    # NOTE(review): presumably these tag non-poem categories - confirm.
    skipped_title_marks = ('(寓', '(歌词', '(旧', '(工')

    hrefs = []
    seen = set()  # O(1) duplicate check; ``hrefs`` keeps the page order.
    for p in mainbox.find_all('p', class_='MsoNormal'):
        for a in p.find_all('a'):
            href = (a.get('href') or '').strip()
            if not href or href in seen:
                continue
            # Skip one specific page, absolute links and image links.
            if (href.endswith('200809/5758.html')
                    or 'http' in href or 'jpg' in href):
                continue
            title = a.text.strip()
            if any(mark in title for mark in skipped_title_marks):
                continue
            seen.add(href)
            hrefs.append(href)

    for href in hrefs:
        read_poem(href)
# Start the crawl when this file is executed. There is no __main__ guard,
# so importing the module triggers the crawl as well.
main()