"""医脉通 (Medlive) disease-wiki scraper for http://disease.medlive.cn."""
import re
import time
import random  # only needed if the randomized sleeps below are re-enabled
from multiprocessing import Process

import requests
import xlwt
from lxml import etree
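# Third-party packages used: requests, lxml, xlwt
# (install with: pip install requests lxml xlwt).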
class Yimaitong:
    def __init__(self):
        # URL of the disease-wiki category listing
        self.url = 'http://disease.medlive.cn/wiki/list/171'
        # current Unix timestamp as a string (computed but never used)
        nd = '%s' % int(round(time.time()))
        # request headers (defined here but never passed to requests.get below)
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9',
'Host': 'www.medlive.cn',
'Referer': 'http://disease.medlive.cn/wiki/list/178',
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36',
}
        # headers for the treatment-detail requests (used below as self.header);
        # the cookie values are session-specific and will expire
        self.header = {
            'Referer': 'http://disease.medlive.cn/wiki/entry/0_0_37977?row=1',
            'Cookie': 'ymt_pk_id=b7470997e9241352; _pk_ses.3.a971=*; sess=3ve23k417je2d1hhipuku66fg2; ymtinfo=eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9; Hm_lvt_62d92d99f7c1e7a31a11759de376479f=1588835592,1589005542,1589006182; Hm_lpvt_62d92d99f7c1e7a31a11759de376479f=1589013816; _pk_id.3.a971=b7470997e9241352.1588835592.7.1589013816.1588925929.',
        }
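    # Hypothetical fetch helper (a sketch; not called by the code below): every
    # request in this class uses requests.get with no timeout or retry, so a
    # shared wrapper like this could centralize both. The retry count and
    # backoff scheme are illustrative assumptions.
    def _get(self, url, headers=None, retries=3):
        for attempt in range(retries):
            try:
                return requests.get(url, headers=headers or self.headers, timeout=10)
            except requests.RequestException:
                time.sleep(2 ** attempt)  # simple exponential backoff
        raise RuntimeError('failed to fetch %s after %d attempts' % (url, retries))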
    def parser_url(self, start, end, pathtxt):
        # parse the top-level category listing
response = requests.get(self.url)
html = etree.HTML(response.text)
href = html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li/a/@href')
fina_dic = {}
fina_data = []
        for url in range(start, end):
            # each index in [start, end) maps to one category link
f = []
time.sleep(0.2)
new_url = 'http://disease.medlive.cn' + href[url]
print(url, new_url)
            # fetch and parse the category page
first_response = requests.get(new_url)
new_html = etree.HTML(first_response.text)
first_href = new_html.xpath('//*[@id="wiki_list_box"]/div[2]/ul/li/dl/dd/a/@href')
            # disease category (department) name for this index
jibing_name = new_html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li[' + str(url + 1) + ']/a/text()')[
0]
            # walk each disease entry under this category
            for second in first_href:
                list_data = []
                time.sleep(1)
try:
second_url = 'http://disease.medlive.cn' + second
second_response = requests.get(second_url)
second_html = etree.HTML(second_response.text)
second_name = second_html.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/label/text()')[0]
second_href = second_html.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a/@href')[0]
three_response = requests.get(second_href)
                    # parse the essentials (精要) page
three_html = etree.HTML(three_response.text)
three_url = three_html.xpath('//*[@id="content"]/div/div[1]/div[1]/div[2]/div[1]/dl/dd[3]/a/@href')[
0]
four_url = 'http://disease.medlive.cn' + three_url
four_response = requests.get(four_url)
four_html = etree.HTML(four_response.text)
time.sleep(0.2)
                    # follow the "详细" (details) link in the first table column
one_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[1]/p/a/@href')[0]
one_detail_url = 'http://disease.medlive.cn' + one_detail
# print(1,one_detail_url)
one_detail_response = requests.get(one_detail_url)
one_detail_html = etree.HTML(one_detail_response.text)
keyword = ['关键因素', ]
keyword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div/h5/span[1]/text()'))
key_data = ['关键内容', ]
key_data.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/div/p/text()'))
otherword = ['其它诊断因素', ]
otherword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div/h5/span[1]/text()'))
other_data = ['其他诊断内容', ]
other_data.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/div/p/text()'))
dengerword = ['危险因素', ]
dengerword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div/h5/span[1]/text()'))
dengerdata = ['危险内容', ]
dengerdata.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/div/p/text()'))
list_data.append(keyword)
list_data.append(key_data)
list_data.append(dengerword)
list_data.append(dengerdata)
list_data.append(otherword)
list_data.append(other_data)
# time.sleep(random.randint(2, 8))
two_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[2]/p/a/@href')[0]
two_detail_url = 'http://disease.medlive.cn' + two_detail
# print(2,two_detail_url)
two_detail_response = requests.get(two_detail_url)
two_detail_html = etree.HTML(two_detail_response.text)
precedence = ['优先检测', ]
precedence.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[1]/h5/span/text()'))
precedencedata = ['优先检测内容', ]
precedencedata.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/table/tbody//text()'))
select = ['可选检测', ]
select.append(two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[1]/h5/text()'))
selectdata = ['可选检测内容', ]
selectdata.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/table/tbody//text()'))
new = ['新的检测', ]
new.append(two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[1]/h5/span/text()'))
newdata = ['新的检测内容', ]
newdata.append(
two_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/table/tbody/tr//text()'))
list_data.append(precedence)
list_data.append(precedencedata)
list_data.append(select)
list_data.append(selectdata)
list_data.append(new)
list_data.append(newdata)
# time.sleep(random.randint(1, 5))
three_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[3]/p/a/@href')[0]
three_detail_url = 'http://disease.medlive.cn' + three_detail
# print(3,three_detail_url)
three_detail_response = requests.get(three_detail_url)
three_detail_html = etree.HTML(three_detail_response.text)
Treatment_conditions = three_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a//text()')
Treatment_conditions_url = three_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a/@href')
                    p = r'\S+'  # keep only the first non-whitespace token of each label
three_re = ['治疗细则', ]
t_re = []
for i in Treatment_conditions:
t_re.append(re.findall(p, i)[0])
three_re.append(t_re)
three_data = ['治疗细则内容', ]
for Treatment_url in Treatment_conditions_url:
new_Treatment_url = 'http://disease.medlive.cn' + Treatment_url
new_Treatment_urlresponse = requests.get(new_Treatment_url, headers=self.header)
Treatment_urlresponse = etree.HTML(new_Treatment_urlresponse.text)
three_data.append(Treatment_urlresponse.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/div[2]//text()'))
list_data.append(three_re)
list_data.append(three_data)
detail_data = {second_name: list_data}
f.append(detail_data)
fina_dic = {jibing_name: f}
                except Exception:
                    # skip entries whose page layout does not match the XPaths above
                    pass
            fina_data.append(fina_dic)
        # keep a plain-text backup of this batch
        with open(pathtxt, 'w+', encoding='utf-8') as backup:
            backup.write(str(fina_data))
print(fina_data)
        # return the scraped data
return fina_data
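    # Shape of the value returned by parser_url (illustrative; the labels are
    # the ones hard-coded above):
    # [{'儿科': [{'Alport综合征': [['关键因素', [...]], ['关键内容', [...]],
    #                              ['危险因素', [...]], ..., ['治疗细则内容', [...]]]}]}]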
    def first_parser_data(self, data, path):
        # flatten the scraped structure and write it to a workbook
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet1')
row = 2
        for i in data:  # each item maps a department to its disease list, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
print(0, jibingkey)
worksheet.write(row, 0, jibingkey)
for bingzhong in jibingvalue:
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        # bingzhongkey is the disease name
print(1, bingzhongkey)
worksheet.write(row, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # detail_data is [label, [values]]; an empty value list
                            # would skip the write loop entirely, so pad it first
                            if len(detail_data[1]) == 0:
                                detail_data[1].append('0')
                            for write_num in detail_data[1]:
for write_data in write_num:
print(2, detail_data[0])
worksheet.write(row, 2, detail_data[0])
print(3, write_data)
worksheet.write(row, 3, write_data)
                                    row += 1
        # note: xlwt writes the legacy .xls format, whatever extension is given
        workbook.save(path)
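    # Resulting sheet layout for first_parser_data (one value per row):
    # col 0: department | col 1: disease | col 2: section label | col 3: value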
    def parser_data(self, data):
        # alternative writer: spreads each label's values across columns
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet1')
        # independent row cursors for the four output columns
        row1 = 0
        row3 = 0
        row = 0
        a = 0
        for i in data:  # each item maps a department to its disease list, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
                worksheet.write(row3, 0, jibingkey)
                for bingzhong in jibingvalue:  # e.g. {'Alport综合征': [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():  # [[label, [values]], ...]
                        worksheet.write(row1, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:  # e.g. ['关键因素', ['肾脏病变', '听力障碍']]
worksheet.write(a, 2, detail_data[0])
a += 1
col = 3
for write_num in detail_data[1]:
worksheet.write(row, col, write_num)
col += 1
row += 1
row3 += 1
row1 += 7
row3 += row1
        # note: xlwt output is really .xls despite the .xlsx extension here
        workbook.save('医脉通—数据11-18.xlsx')
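    # If genuine .xlsx output is required, openpyxl (an assumed extra
    # dependency) writes that format natively; a minimal sketch:
    #
    #     from openpyxl import Workbook
    #     wb = Workbook()
    #     ws = wb.active
    #     ws.cell(row=1, column=1, value='儿科')  # openpyxl rows/columns are 1-indexed
    #     wb.save('yimaitong.xlsx')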
    def main(self):
        # scrape the category list in five slices; the scraping itself runs
        # sequentially, and each slice is then written to its own workbook
        # in a child process
        batches = [('1-5', 0, 5), ('5-10', 5, 10), ('10-15', 10, 15),
                   ('15-20', 15, 20), ('20-25', 20, 25)]
        process_list = []
        for label, start, end in batches:
            data = self.parser_url(start, end, 'yimaitong %s.txt' % label)
            p = Process(target=self.first_parser_data,
                        args=(data, 'yimaitong %s.xlsx' % label))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()
if __name__ == '__main__':
    # the guard matters here because multiprocessing may spawn fresh interpreters
    yimaitong = Yimaitong()
    # yimaitong.main()
    pathtxt1 = '内容.txt'
    data = yimaitong.parser_url(0, 1, pathtxt1)  # smoke test: first category only
    # yimaitong.parser_data(data)
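# Usage sketch: the guarded block above scrapes only the first category as a
# smoke test; a full five-slice run (assuming that is the intended entry
# point) would be:
#
#     if __name__ == '__main__':
#         Yimaitong().main()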