-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
162 lines (140 loc) · 5.02 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import requests
import bs4
import numpy as np
from bs4 import BeautifulSoup
# Base URL of the site; every href scraped from a page is appended to it.
youlai = "https://www.youlai.cn"

# Accumulator for the scraped (question, answer) pairs: starts with zero rows
# and two columns.  QApage() appends one row per page and the script entry
# point writes the array to disk with np.save().
# (A module-level ``global`` statement has no effect, so none is used here;
# functions that rebind QAL declare it global themselves.)
QAL = np.empty(dtype=str, shape=[0, 2])
# A three-column variant (question, description, answer) named QDAL is
# currently disabled:
# QDAL = np.empty(dtype=str, shape=[0, 3])
def findqapage(url):
    """Return the URL of the Q&A list page linked from a disease page.

    Fetches ``url``, walks to the first ``<ul>`` inside the first ``<div>`` of
    the body child at position 11, and looks for an entry whose link text is
    exactly '相关问答'.

    Args:
        url: Absolute URL of a disease page.

    Returns:
        ``youlai`` + the matching link's ``href``, or ``None`` when the
        response status is not 200 or no entry carries that link text.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    body = soup.find('html').find('body')
    # Layout-dependent: the container is picked by its position among the
    # body's children, so a change in the page template breaks this lookup.
    container = list(body.children)[11]
    nav_list = container.find('div').find('ul')
    for index, item in enumerate(nav_list.children):
        # Only odd positions are inspected; presumably the even positions
        # hold the whitespace text nodes between the list entries.
        if index % 2 == 0:
            continue
        # Skip anything that is not a tag, and entries without a link,
        # instead of failing on a missing anchor.
        if not isinstance(item, bs4.element.Tag):
            continue
        anchor = item.find('a')
        if anchor is None:
            continue
        if anchor.get_text() == '相关问答':
            qa_list_url = youlai + anchor['href']
            print(qa_list_url)  # log the Q&A list URL
            return qa_list_url
    return None
def _crawl_qa_list_page(url):
    """Scrape one Q&A list page and return the URL of the next page, if any.

    Prints the HTTP status code, calls ``QApage`` for every question entry
    found on the page, then looks in the pager for a link whose text is
    '下一页'.

    Returns:
        ``youlai`` + the next-page ``href`` (also printed), or ``None`` when
        the status is not 200, the document has no usable ``<html>`` element,
        the pager does not have exactly two children, or it holds no such link.
    """
    response = requests.get(url)
    print(response.status_code)  # 200 means the page was fetched successfully
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    html = soup.find('html')
    if not hasattr(html, 'children'):
        return None

    body = html.find('body')
    # Layout-dependent positional lookups: they follow the page template.
    main_section = list(body.children)[18]           # page main section
    list_section = list(main_section.children)[3]    # outer question list
    question_list = list(list_section.children)[1]   # inner question list
    for index, item in enumerate(question_list.children):
        # Only odd positions that are tags are treated as question entries.
        if index % 2 == 1 and hasattr(item, 'children'):
            link = list(list(item.children)[1].children)[0]
            entry_url = youlai + link['href']
            print(entry_url)  # log the Q&A page entry URL
            QApage(entry_url)

    pager_bar = list(list_section.children)[3]       # pagination bar
    pager_children = list(list(pager_bar.children)[1].children)
    if len(pager_children) != 2:
        return None
    for item in pager_children[1].children:
        if hasattr(item, 'children'):
            anchor = list(item.children)[0]
            if anchor.get_text() == "下一页":
                next_url = youlai + anchor['href']
                print(next_url)  # log the next list page URL
                return next_url
    return None


def QAlist(url):
    """Crawl a Q&A list page and every following page reachable via '下一页'.

    Pages are visited in a loop rather than by recursion, so the number of
    list pages is not limited by the interpreter's recursion depth and each
    parsed page can be released before the next one is fetched.  A URL that
    was already visited ends the crawl, which guards against a pager that
    links back to an earlier page.

    Args:
        url: Absolute URL of the first Q&A list page.

    Returns:
        None. Scraped pairs are stored by ``QApage``.
    """
    visited = set()
    while url and url not in visited:
        visited.add(url)
        url = _crawl_qa_list_page(url)
def QApage(url):
    """Scrape one Q&A page and append its (question, answer) pair to ``QAL``.

    Prints the HTTP status code, then the question and the answer, and adds
    the pair as a new row of the module-level ``QAL`` array.

    Args:
        url: Absolute URL of a single Q&A page.

    Returns:
        None. Returns early, without touching ``QAL``, when the response
        status is not 200.
    """
    global QAL  # rebound below, because np.append returns a new array
    page = requests.get(url)
    print(page.status_code)  # 200 means the page was fetched successfully
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    body = soup.find('html').find('body')
    # Layout-dependent positional lookups: they follow the page template.
    content = list(body.children)[17]
    main = list(content.children)[3]                  # page main body

    question_section = list(main.children)[3]         # question & description
    question_box = list(question_section.children)[1]
    question_heading = list(question_box.children)[1]
    # The child node itself is used as the question text (no get_text()), so
    # it has to be a string node for the concatenation below to work.
    question = list(question_heading.children)[1]
    print("问题:" + question)
    # Extraction of the detailed question description is disabled; QAL holds
    # only the question and the answer.

    answer_section = list(main.children)[9]
    answer_box = list(answer_section.children)[3]
    answer = list(answer_box.children)[1].get_text()
    print("回答:" + answer)

    # Store the pair as a new row of QAL.
    QAL = np.append(QAL, [[question, answer]], axis=0)
def diseslist(url):
    """Crawl every disease linked from one alphabetical index page.

    For each disease link found on the page, prints its URL, asks
    ``findqapage`` for the disease's Q&A list page and, when one is returned,
    crawls it with ``QAlist``.

    Args:
        url: Absolute URL of an index page listing diseases for one letter.

    Returns:
        None. Returns early when the response status is not 200.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    body = soup.find('html').find('body')
    # Layout-dependent positional lookups: they follow the page template.
    main_wrapper = list(body.children)[11].find('div')
    section = list(list(main_wrapper.children)[9].children)[3]
    disease_groups = section.find('div').find('dl')
    for group_index, group in enumerate(disease_groups.children):
        # Only odd positions are inspected, at both nesting levels.
        if group_index % 2 == 0:
            continue
        for node_index, node in enumerate(group.children):
            if node_index % 2 == 1 and node.name == 'a':
                disease_url = youlai + node['href']
                print(disease_url)
                qa_list_url = findqapage(disease_url)
                if qa_list_url:
                    QAlist(qa_list_url)
if __name__ == "__main__":
    # Entry point: read the alphabet navigation bar from the first index
    # page, crawl the index page of every letter, then save the collected
    # question/answer pairs to disk.
    filename = 'irqa_data.npy'
    completed = False
    try:
        url = "https://www.youlai.cn/dise/pz_A_1.html"
        page = requests.get(url)
        print(page.status_code)  # 200 means the page was fetched successfully
        soup = BeautifulSoup(page.content, 'html.parser')
        body = soup.find('html').find('body')
        # Layout-dependent positional lookup: it follows the page template.
        mbody = list(body.children)[11]
        mabody = mbody.find('div')
        alphabet_nav = mabody.find('dl')  # alphabet navigation bar
        letter_links = alphabet_nav.find('dd').find('p')
        for item in list(letter_links.children):
            if item.name == 'a':
                print(youlai + item['href'])
                diseslist(youlai + item['href'])
        completed = True
    finally:
        # Save even when the crawl is interrupted, so that pairs scraped so
        # far are not lost.  After a failure an empty result is not written,
        # which avoids replacing an existing file with nothing.
        if completed or len(QAL):
            np.save(filename, QAL)