Baidu Baike crawler demo

medicine.dic - 副本.default is a text file of about 170,000 lines, one term per line, for example:
感冒 11111111
发烧 2222222
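
Only the first column is used as the search keyword. Below is a minimal sketch of pulling that column out, assuming the columns are tab-separated as the main script expects (file name taken from the example above):

# Minimal sketch: extract the keyword (first column) from each dictionary line.
# Assumes "term<TAB>frequency" lines as in the example above.
with open('medicine.dic - 副本.default', 'r', encoding='utf-8') as f:
    keywords = [line.split('\t')[0].strip() for line in f if line.strip()]
print(keywords[:5])  # e.g. ['感冒', '发烧', ...]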

import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
from lxml import etree
import gc
import random
import time
import winsound

def baike(word):
    def test_url(soup):  # check whether Baidu Baike actually has an entry for this word
        result = soup.find(text=re.compile("百度百科未收录该词条"))  # "entry not found" banner
        del soup
        return result is None  # no banner found -> the entry exists

    def parse_write_info(html, word):
        # 1. <h1> title: every entry page has one
        h1_word = ''
        similar_word = ''
        h1s = html.xpath('//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]//h1')
        if len(h1s) == 1:
            h1_word = h1s[0].text
            # print('h1 title of this result: ' + h1_word + '\n')
        # synonyms
        similar_words = html.xpath('//span[@class="view-tip-panel"]')
        if len(similar_words) == 1:
            similar_word = similar_words[0].xpath('string(.)').replace('同义词', '', 1).strip()
            # print('synonyms in this result: ' + similar_word + '\n')


        # 2. Basic-info box (the <dt>/<dd> table); only some entries have it
        basic_info = {}
        basic_info_lists1 = html.xpath('//div[@class="basic-info cmn-clearfix"]//dt')
        basic_info_lists2 = html.xpath('//div[@class="basic-info cmn-clearfix"]//dd')
        for i in range(len(basic_info_lists1)):
            # use string(.) rather than .text: the cells often contain nested tags,
            # in which case .text would be None and raise AttributeError
            key1 = basic_info_lists1[i].xpath('string(.)').strip()
            value1 = basic_info_lists2[i].xpath('string(.)').strip()
            basic_info[key1] = value1
            # print(key1 + ': ' + value1)

        # 3. Table-of-contents module
        total_lists1 = html.xpath('//div[@class="lemma-catalog"]/div/ol/li/span[1]')  # section numbers, e.g. "1", "1.1"
        total_lists2 = html.xpath('//div[@class="lemma-catalog"]/div/ol/li/span[2]/a')  # section titles
        total = []
        for i in range(len(total_lists2)):
            key1 = total_lists1[i].text
            value1 = total_lists2[i].text
            # a bare top-level number ("1".."20") starts a new chapter group
            if key1 in ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                        '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'):
                total.append('\n')
            total.append(value1)
        # join titles with tabs, then split back into one sublist per top-level chapter
        string = '\t'.join(total).strip()
        listskey_values = string.split('\n')
        catalog = []
        for listskey_value in listskey_values:
            sub_listskys = [item for item in listskey_value.split('\t') if item]
            catalog.append(sub_listskys)
        # print(catalog)

        # 4. Main body extraction
        level2 = html.xpath('//div[@class="main-content"]')
        if len(level2) == 1:
            main_content = etree.tostring(level2[0], encoding='utf-8')  # key call: serialize the subtree with etree.tostring()
            lis = main_content.decode()

            content = {}
            # split the body on level-2 section headers
            h2_lists = lis.split('<div class="para-title level-2" label-module="para-title">')
            # (1) no h2 sections: the whole body is one block
            if len(h2_lists) == 1:
                sub_value01 = ''
                contents = etree.HTML(h2_lists[0]).xpath('//div[@class="para"]')
                for k in range(len(contents)):
                    sub_value01 = sub_value01 + contents[k].xpath('string(.)').strip()  # key call: string(.) flattens nested markup to text
                content = {'no_h2': sub_value01}
            # (2) one or more h2 sections: one dict entry per section title
            elif len(h2_lists) > 1:
                for i in range(len(h2_lists[1:])):
                    h2 = etree.HTML(h2_lists[i + 1]).xpath('//h2[@class="title-text"]/text()')
                    # print(i)
                    # print(str(h2[0]))
                    cont_text = ''
                    contents = etree.HTML(h2_lists[i + 1]).xpath('//div[@class="para"]')
                    for k in range(len(contents)):
                        cont_text = cont_text + contents[k].xpath('string(.)').strip()
                    content[str(h2[0])] = cont_text

            record = {
                'word': word,
                'top_word': h1_word,
                'similar_word': similar_word,
                'basic_info': basic_info,
                'catalog': catalog,
                'content': content,
            }

            # append one Python-literal dict per line
            with open('medicine_result.txt', 'a', encoding='utf-8') as fp:
                fp.write(str(record) + '\n')

    def start(word):
        keyword = urllib.parse.urlencode({"word": word})  # URL-encode the keyword for the query string
        print("http://baike.baidu.com/search/word?%s" % keyword)
        response = urllib.request.urlopen("http://baike.baidu.com/search/word?%s" % keyword, timeout=2000)  # timeout is in seconds
        html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        html = etree.HTML(html)
        if test_url(soup):
            parse_write_info(html, word)
        response.close()    # remember to close the response
    try:
        start(word)
        gc.collect()
    except AttributeError:
        print("百度百科未收录该词条")  # entry not found on Baidu Baike

if __name__ == "__main__":
    # absolute path of the input dictionary file
    # results are appended to medicine_result.txt in the current directory
    file_path = r'C:\Users\xu134\Documents\WeChat Files\xuyong95901\FileStorage\File\2020-07\medicine.dic - 副本.default'
    with open(file_path, 'r', encoding='utf-8') as f:
        keywords = f.readlines()
        i = 1
        for keyword in keywords:
            print('Downloading entry ' + str(i))
            word = keyword.split('\t')[0]  # first (tab-separated) column is the term
            # content = str(input("Enter a keyword: "))
            word = str(word)
            baike(word)
            i = i + 1
            if i % 20 == 0:
                ret = random.uniform(3, 5)  # pause 3-5 seconds every 20 requests
                time.sleep(ret)
            duration = 50  # millisecond
            freq = 400  # Hz
            winsound.Beep(freq, duration)  # audible progress beep (Windows only)
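
Each result is appended as str(record) on its own line, so medicine_result.txt can be loaded back into Python objects with ast.literal_eval. A minimal sketch of reading the output back (assumes the file exists and every line is a well-formed Python literal):

import ast

# Minimal sketch: read the scraped records back into a list of dicts.
records = []
with open('medicine_result.txt', 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        if line:
            records.append(ast.literal_eval(line))  # each line was written with str(record)

for r in records[:3]:
    print(r['word'], '->', list(r['content'].keys()))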
