"""医脉通 (Medlive) disease-wiki scraper for http://disease.medlive.cn."""
import re
import time
import random  # only needed if the randomized sleeps below are re-enabled
from multiprocessing import Process

import requests
import xlwt
from lxml import etree
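# Third-party packages used: requests, lxml, xlwt
# (install with: pip install requests lxml xlwt).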
class Yimaitong:
    def __init__(self):
        # URL of the disease-wiki category listing
        self.url = 'http://disease.medlive.cn/wiki/list/171'
        # current Unix timestamp as a string (computed but never used)
        nd = '%s' % int(round(time.time()))
        # request headers (defined here but never passed to requests.get below)
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie': 'eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9',
'Host': 'www.medlive.cn',
'Referer': 'http://disease.medlive.cn/wiki/list/178',
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36',
}
        # headers for the treatment-detail requests (used below as self.header);
        # the cookie values are session-specific and will expire
        self.header = {
            'Referer': 'http://disease.medlive.cn/wiki/entry/0_0_37977?row=1',
            'Cookie': 'ymt_pk_id=b7470997e9241352; _pk_ses.3.a971=*; sess=3ve23k417je2d1hhipuku66fg2; ymtinfo=eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9; Hm_lvt_62d92d99f7c1e7a31a11759de376479f=1588835592,1589005542,1589006182; Hm_lpvt_62d92d99f7c1e7a31a11759de376479f=1589013816; _pk_id.3.a971=b7470997e9241352.1588835592.7.1589013816.1588925929.',
        }
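    # Hypothetical fetch helper (a sketch; not called by the code below): every
    # request in this class uses requests.get with no timeout or retry, so a
    # shared wrapper like this could centralize both. The retry count and
    # backoff scheme are illustrative assumptions.
    def _get(self, url, headers=None, retries=3):
        for attempt in range(retries):
            try:
                return requests.get(url, headers=headers or self.headers, timeout=10)
            except requests.RequestException:
                time.sleep(2 ** attempt)  # simple exponential backoff
        raise RuntimeError('failed to fetch %s after %d attempts' % (url, retries))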
    def parser_url(self, start, end, pathtxt):
        # parse the top-level category listing
response = requests.get(self.url)
html = etree.HTML(response.text)
href = html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li/a/@href')
fina_dic = {}
fina_data = []
        for url in range(start, end):
            # each index in [start, end) maps to one category link
f = []
time.sleep(0.2)
new_url = 'http://disease.medlive.cn' + href[url]
print(url, new_url)
            # fetch and parse the category page
first_response = requests.get(new_url)
new_html = etree.HTML(first_response.text)
first_href = new_html.xpath('//*[@id="wiki_list_box"]/div[2]/ul/li/dl/dd/a/@href')
            # disease category (department) name for this index
jibing_name = new_html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li[' + str(url + 1) + ']/a/text()')[
0]
            # walk each disease entry under this category
            for second in first_href:
                list_data = []
                time.sleep(1)
try:
second_url = 'http://disease.medlive.cn' + second
second_response = requests.get(second_url)
second_html = etree.HTML(second_response.text)
second_name = second_html.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/label/text()')[0]
second_href = second_html.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a/@href')[0]
three_response = requests.get(second_href)
                    # parse the essentials (精要) page
three_html = etree.HTML(three_response.text)
three_url = three_html.xpath('//*[@id="content"]/div/div[1]/div[1]/div[2]/div[1]/dl/dd[3]/a/@href')[
0]
four_url = 'http://disease.medlive.cn' + three_url
four_response = requests.get(four_url)
four_html = etree.HTML(four_response.text)
time.sleep(0.2)
                    # follow the "详细" (details) link in the first table column
one_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[1]/p/a/@href')[0]
one_detail_url = 'http://disease.medlive.cn' + one_detail
# print(1,one_detail_url)
one_detail_response = requests.get(one_detail_url)
one_detail_html = etree.HTML(one_detail_response.text)
keyword = ['关键因素', ]
keyword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div/h5/span[1]/text()'))
key_data = ['关键内容', ]
key_data.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/div/p/text()'))
otherword = ['其它诊断因素', ]
otherword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div/h5/span[1]/text()'))
other_data = ['其他诊断内容', ]
other_data.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/div/p/text()'))
dengerword = ['危险因素', ]
dengerword.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div/h5/span[1]/text()'))
dengerdata = ['危险内容', ]
dengerdata.append(
one_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/div/p/text()'))
list_data.append(keyword)
list_data.append(key_data)
list_data.append(dengerword)
list_data.append(dengerdata)
list_data.append(otherword)
list_data.append(other_data)
# time.sleep(random.randint(2, 8))
two_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[2]/p/a/@href')[0]
two_detail_url = 'http://disease.medlive.cn' + two_detail
# print(2,two_detail_url)
two_detail_response = requests.get(two_detail_url)
two_detail_html = etree.HTML(two_detail_response.text)
precedence = ['优先检测', ]
precedence.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[1]/h5/span/text()'))
precedencedata = ['优先检测内容', ]
precedencedata.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/table/tbody//text()'))
select = ['可选检测', ]
select.append(two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[1]/h5/text()'))
selectdata = ['可选检测内容', ]
selectdata.append(
two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/table/tbody//text()'))
new = ['新的检测', ]
new.append(two_detail_html.xpath('//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[1]/h5/span/text()'))
newdata = ['新的检测内容', ]
newdata.append(
two_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/table/tbody/tr//text()'))
list_data.append(precedence)
list_data.append(precedencedata)
list_data.append(select)
list_data.append(selectdata)
list_data.append(new)
list_data.append(newdata)
# time.sleep(random.randint(1, 5))
three_detail = \
four_html.xpath(
'//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[3]/p/a/@href')[0]
three_detail_url = 'http://disease.medlive.cn' + three_detail
# print(3,three_detail_url)
three_detail_response = requests.get(three_detail_url)
three_detail_html = etree.HTML(three_detail_response.text)
Treatment_conditions = three_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a//text()')
Treatment_conditions_url = three_detail_html.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a/@href')
                    p = r'\S+'  # keep only the first non-whitespace token of each label
three_re = ['治疗细则', ]
t_re = []
for i in Treatment_conditions:
t_re.append(re.findall(p, i)[0])
three_re.append(t_re)
three_data = ['治疗细则内容', ]
for Treatment_url in Treatment_conditions_url:
new_Treatment_url = 'http://disease.medlive.cn' + Treatment_url
new_Treatment_urlresponse = requests.get(new_Treatment_url, headers=self.header)
Treatment_urlresponse = etree.HTML(new_Treatment_urlresponse.text)
three_data.append(Treatment_urlresponse.xpath(
'//*[@id="wiki_view_frm"]/div/div[1]/div[2]//text()'))
list_data.append(three_re)
list_data.append(three_data)
detail_data = {second_name: list_data}
f.append(detail_data)
fina_dic = {jibing_name: f}
                except Exception:
                    # skip entries whose page layout does not match the XPaths above
                    pass
            fina_data.append(fina_dic)
        # keep a plain-text backup of this batch
        with open(pathtxt, 'w+', encoding='utf-8') as backup:
            backup.write(str(fina_data))
print(fina_data)
        # return the scraped data
return fina_data
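    # Shape of the value returned by parser_url (illustrative; the labels are
    # the ones hard-coded above):
    # [{'儿科': [{'Alport综合征': [['关键因素', [...]], ['关键内容', [...]],
    #                              ['危险因素', [...]], ..., ['治疗细则内容', [...]]]}]}]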
    def first_parser_data(self, data, path):
        # flatten the scraped structure and write it to a workbook
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet1')
row = 2
        for i in data:  # each item maps a department to its disease list, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
print(0, jibingkey)
worksheet.write(row, 0, jibingkey)
for bingzhong in jibingvalue:
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        # bingzhongkey is the disease name
print(1, bingzhongkey)
worksheet.write(row, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # detail_data is [label, [values]]; an empty value list
                            # would skip the write loop entirely, so pad it first
                            if len(detail_data[1]) == 0:
                                detail_data[1].append('0')
                            for write_num in detail_data[1]:
for write_data in write_num:
print(2, detail_data[0])
worksheet.write(row, 2, detail_data[0])
print(3, write_data)
worksheet.write(row, 3, write_data)
                                    row += 1
        # note: xlwt writes the legacy .xls format, whatever extension is given
        workbook.save(path)
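    # Resulting sheet layout for first_parser_data (one value per row):
    # col 0: department | col 1: disease | col 2: section label | col 3: value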
    def parser_data(self, data):
        # alternative writer: spreads each label's values across columns
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet1')
        # independent row cursors for the four output columns
        row1 = 0
        row3 = 0
        row = 0
        a = 0
        for i in data:  # each item maps a department to its disease list, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
                worksheet.write(row3, 0, jibingkey)
                for bingzhong in jibingvalue:  # e.g. {'Alport综合征': [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():  # [[label, [values]], ...]
                        worksheet.write(row1, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:  # e.g. ['关键因素', ['肾脏病变', '听力障碍']]
worksheet.write(a, 2, detail_data[0])
a += 1
col = 3
for write_num in detail_data[1]:
worksheet.write(row, col, write_num)
col += 1
row += 1
row3 += 1
row1 += 7
row3 += row1
        # note: xlwt output is really .xls despite the .xlsx extension here
        workbook.save('医脉通—数据11-18.xlsx')
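    # If genuine .xlsx output is required, openpyxl (an assumed extra
    # dependency) writes that format natively; a minimal sketch:
    #
    #     from openpyxl import Workbook
    #     wb = Workbook()
    #     ws = wb.active
    #     ws.cell(row=1, column=1, value='儿科')  # openpyxl rows/columns are 1-indexed
    #     wb.save('yimaitong.xlsx')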
    def main(self):
        # scrape the category list in five slices; the scraping itself runs
        # sequentially, and each slice is then written to its own workbook
        # in a child process
        batches = [('1-5', 0, 5), ('5-10', 5, 10), ('10-15', 10, 15),
                   ('15-20', 15, 20), ('20-25', 20, 25)]
        process_list = []
        for label, start, end in batches:
            data = self.parser_url(start, end, 'yimaitong %s.txt' % label)
            p = Process(target=self.first_parser_data,
                        args=(data, 'yimaitong %s.xlsx' % label))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()
if __name__ == '__main__':
    # the guard matters here because multiprocessing may spawn fresh interpreters
    yimaitong = Yimaitong()
    # yimaitong.main()
    pathtxt1 = '内容.txt'
    data = yimaitong.parser_url(0, 1, pathtxt1)  # smoke test: first category only
    # yimaitong.parser_data(data)
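# Usage sketch: the guarded block above scrapes only the first category as a
# smoke test; a full five-slice run (assuming that is the intended entry
# point) would be:
#
#     if __name__ == '__main__':
#         Yimaitong().main()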