zgsgw_net.py
import requests
import re
from util import Profile, write_poem
from bs4 import BeautifulSoup
import time
from random import randint

BASE_URL = 'http://www.zgshige.com'

# Matches the javascript redirect on a poet's page and captures the three pieces
# (URL prefix, poet name, query suffix) that together form the poet's work-list URL.
list_reg = re.compile(r"javascript:window.location.href='(http://www.zgshige.com/zcms/poem/list\?SiteID=\d+&poetname=)'\+encodeURI\('(.+?)'\)\+'(&articleID=\d+&articleContributeUID=\d+&catalogID=\d+)';")
# Matches one entry on a work-list page and captures the poem URL, the date embedded in it, and the title.
title_reg = re.compile(r'class="sr_dt_title">\s*<a href="(http://www.zgshige.com/c/([\d-]+)/\d+.shtml)">《(.+?)》</a>')


def get(url):
    '''Fetch a page and decode it as UTF-8.'''
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text


def read_content(url):
    '''Read the text of a single poem page, turning <br> and paragraph tags into line breaks.'''
    text = (get(url)
            .replace(u'<br>', '\n').replace('<BR>', '\n')
            .replace(u'<br/>', '\n').replace('<BR/>', '\n')
            .replace(u'</P>', '\n</P>').replace(u'</p>', '\n</p>'))
    soup = BeautifulSoup(text, 'lxml')
    return soup.find('div', id='content').text


def read_list(poet_name, list_url):
    '''Read every title and link on a poet's work-list page and save each poem.'''
    text = get(list_url).replace(' ', '')
    titles = title_reg.findall(text)
    for title in titles:
        url = title[0]
        date = title[1]
        content = read_content(url) + '\n' + date
        write_poem(Profile(href=url, author=poet_name, title=title[2]), content)


def read_by_poet(url):
    '''Given a poet page path (relative to BASE_URL), resolve the absolute
    work-list URL and read all of that poet's works.'''
    text = get(BASE_URL + url)
    list_result = list_reg.findall(text)
    if list_result:
        result = list_result[0]
        poet_name = result[1]
        url = ''.join(result)
        read_list(poet_name, url)


# Matches the link to a poet's page on the poet index pages below.
poet_reg = re.compile(r'<a href="(/c/[\d-]+?/\d+.shtml)" class="p-t-xs p-b-xs block text-center" target="_blank">.*?</a>')


def read_teyao():
    '''Crawl the specially invited poets (特邀诗人) index.'''
    text = get('http://www.zgshige.com/tysr/')
    poets = poet_reg.findall(text)
    for poet_url in poets:
        read_by_poet(poet_url)
        print(BASE_URL + poet_url)
        time.sleep(randint(4, 10))


def read_zhuzhan():
    '''Crawl the resident poets (驻站诗人) index.'''
    text = get('http://www.zgshige.com/zzsr/')
    poets = poet_reg.findall(text)
    for poet_url in poets:
        read_by_poet(poet_url)
        print(BASE_URL + poet_url)
        time.sleep(randint(4, 10))


# Matches the link to a poet's page on the renowned poets (名家) index pages.
mingjia_reg = re.compile(r'<a class="h4 bold" href="(/c/[\d-]+/\d+?.shtml)" target="_blank">')


def read_mingjia_page(page_num):
    '''Crawl one page of the renowned poets index; page 1 uses a different URL.'''
    if page_num == 1:
        url = 'http://www.zgshige.com/zzmjx/index.shtml'
    else:
        url = 'http://www.zgshige.com/zzmjx/index_{}.shtml'.format(page_num)
    text = get(url)
    poets = mingjia_reg.findall(text)
    for poet_url in poets:
        read_by_poet(poet_url)
        print(BASE_URL + poet_url)
        # time.sleep(randint(4, 10))


def read_mingjia():
    '''Crawl pages 1-13 of the renowned poets index.'''
    for i in range(13):
        read_mingjia_page(i + 1)
        print(i + 1)


read_mingjia()
# read_zhuzhan()
# read_teyao()
# read_by_poet('/c/2018-08-26/6985461.shtml')
# read_content('http://www.zgshige.com/c/2021-01-15/16545743.shtml')
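
# Note: `Profile` and `write_poem` are imported from the repo's local `util`
# module, which is not part of this file. The sketch below is only an assumption
# for readers without the rest of the repo: the field names mirror how Profile is
# constructed above, but the storage behavior of write_poem is a guess, not the
# project's actual implementation.
#
#   from collections import namedtuple
#
#   # Metadata passed alongside each poem's text.
#   Profile = namedtuple('Profile', ['href', 'author', 'title'])
#
#   def write_poem(profile, content):
#       # Hypothetical: save one poem per UTF-8 text file named after author and title.
#       file_name = '{}-{}.txt'.format(profile.author, profile.title)
#       with open(file_name, 'w', encoding='utf-8') as f:
#           f.write(profile.href + '\n' + content)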