注:medicine.dic - 副本.default 是一个约17万行的文本文件,每行一个词条及其编号(脚本按制表符切分,只取每行的第一列作为词条),例如:
感冒 11111111
发烧 2222222
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
from lxml import etree
import gc
import random
import time
import winsound
# Pattern for the notice Baidu Baike puts on the page when a term has no entry.
_NOT_FOUND_RE = re.compile("百度百科未收录该词条")
# Markup that opens each level-2 section heading in the serialized article body.
_SECTION_MARKER = '<div class="para-title level-2" label-module="para-title">'


def _node_text(node):
    """Return the whole text of an lxml element (descendants included), stripped."""
    return node.xpath('string(.)').strip()


def _has_entry(soup):
    """Return True unless the parsed page carries the "term not indexed" notice."""
    return soup.find(string=_NOT_FOUND_RE) is None


def _parse_fragment(markup):
    """Parse an HTML fragment with lxml; return None when it is blank."""
    if not markup.strip():
        return None
    return etree.HTML(markup)


def _paragraph_text(fragment_tree):
    """Concatenate the stripped text of every ``div.para`` in a parsed fragment."""
    if fragment_tree is None:
        return ''
    return ''.join(
        _node_text(para) for para in fragment_tree.xpath('//div[@class="para"]')
    )


def _extract_catalog(tree):
    """Return the table of contents as a list of groups of titles.

    Each group starts with a top-level title (its index label is a number)
    followed by the titles of its sub-entries.  A page without a catalog
    yields ``[[]]``, which is what the original implementation produced.
    """
    groups = [[]]
    # Walk the <li> items so that every index label is paired with the link
    # of the same item, instead of indexing two independent node lists.
    for item in tree.xpath('//div[@class="lemma-catalog"]/div/ol/li'):
        links = item.xpath('./span[2]/a')
        if not links:
            continue
        labels = item.xpath('./span[1]')
        index_label = (labels[0].text or '').strip() if labels else ''
        title = (links[0].text or '').strip()
        # A numeric label marks a top-level entry (no longer capped at 20).
        if index_label.isdigit() and groups[-1]:
            groups.append([])
        if title:
            groups[-1].append(title)
    return groups


def _extract_content(tree):
    """Return the article text keyed by level-2 section title.

    When the article has no level-2 headings, all paragraph text is stored
    under the single key ``'no_h2'``.  When the page has no unique
    ``main-content`` container, an empty dict is returned.
    """
    containers = tree.xpath('//div[@class="main-content"]')
    if len(containers) != 1:
        return {}
    # Serialize the container so it can be cut at the section-heading markup.
    markup = etree.tostring(containers[0], encoding='utf-8').decode()
    lead, *sections = markup.split(_SECTION_MARKER)
    if not sections:
        return {'no_h2': _paragraph_text(_parse_fragment(lead))}

    content = {}
    for section in sections:
        section_tree = _parse_fragment(section)
        if section_tree is None:
            continue
        titles = section_tree.xpath('//h2[@class="title-text"]/text()')
        if not titles:
            # No heading text to use as a key; skip rather than crash.
            continue
        content[str(titles[0])] = _paragraph_text(section_tree)
    return content


def _extract_record(tree, word):
    """Build the result record for *word* from the parsed entry page."""
    # 1. Entry title (<h1>), present on every entry page.
    headings = tree.xpath(
        '//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]//h1'
    )
    top_word = (headings[0].text or '') if len(headings) == 1 else ''

    # Synonym tip shown next to the title, with its leading label removed.
    tips = tree.xpath('//span[@class="view-tip-panel"]')
    similar_word = ''
    if len(tips) == 1:
        similar_word = tips[0].xpath('string(.)').replace('同义词', '', 1).strip()

    # 2. "Basic info" name/value pairs; only some entries have them.  zip()
    # stops at the shorter list, and string(.) keeps values that are wrapped
    # in child elements such as links.
    names = tree.xpath('//div[@class="basic-info cmn-clearfix"]//dt')
    values = tree.xpath('//div[@class="basic-info cmn-clearfix"]//dd')
    basic_info = {
        _node_text(name): _node_text(value) for name, value in zip(names, values)
    }

    return {
        'word': word,
        'top_word': top_word,
        'similar_word': similar_word,
        'basic_info': basic_info,
        'catalog': _extract_catalog(tree),      # 3. table of contents
        'content': _extract_content(tree),      # 4. article body
    }


def baike(word, *, timeout=2000, output_path='medicine_result.txt'):
    """Look up *word* on Baidu Baike and append the parsed entry to a file.

    The search URL is printed, the page is downloaded, and, if the term has
    an entry, one line containing ``str()`` of the result dict (keys
    ``word``, ``top_word``, ``similar_word``, ``basic_info``, ``catalog``,
    ``content``) is appended to *output_path*.

    Args:
        word: The term to search for.
        timeout: Socket timeout in seconds passed to ``urlopen``.
            NOTE(review): the inherited default of 2000 s is very long and
            may have been meant as milliseconds -- confirm.
        output_path: UTF-8 text file that results are appended to.

    Returns:
        None.  Download and parse failures are reported on stdout and the
        word is skipped, so a long batch run is not aborted.
    """
    import http.client  # local import: the file header does not import it

    url = "http://baike.baidu.com/search/word?%s" % urllib.parse.urlencode(
        {"word": word}
    )
    print(url)
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            raw_html = response.read()
    except (OSError, http.client.HTTPException) as exc:
        # URLError and socket timeouts are OSError subclasses.
        print('请求失败,已跳过:%s (%r)' % (word, exc))
        return

    if not raw_html.strip():
        print('页面内容为空,已跳过:%s' % word)
        return

    if not _has_entry(BeautifulSoup(raw_html, "html.parser")):
        print("百度百科未收录该词条")
        return

    try:
        record = _extract_record(etree.HTML(raw_html), word)
    except AttributeError as exc:
        # Unexpected page structure: report it as what it is, not as a
        # missing entry.
        print('解析失败,已跳过:%s (%r)' % (word, exc))
        return

    with open(output_path, 'a', encoding='utf-8') as fp:
        fp.write(str(record) + '\n')
    # Kept from the original: force a collection after each parsed page.
    gc.collect()
if __name__ == "__main__":
    import sys  # local import: only the script entry point needs it

    # Absolute path of the dictionary file.  An optional first command-line
    # argument overrides this default.  Results are appended by baike() to
    # medicine_result.txt in the current working directory.
    file_path = r'C:\Users\xu134\Documents\WeChat Files\xuyong95901\FileStorage\File\2020-07\medicine.dic - 副本.default'
    if len(sys.argv) > 1:
        file_path = sys.argv[1]

    # 'utf-8-sig' also accepts a file with a BOM, which would otherwise end
    # up glued to the first word.  Only the first tab-separated column of
    # each line is the search term; blank lines are dropped.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        candidates = [line.split('\t')[0].strip() for line in f]
    words = [candidate for candidate in candidates if candidate]

    for count, word in enumerate(words, start=1):
        print('正在下载第 ' + str(count) + ' 条数据')
        baike(word)
        # Pause for a random 3-5 seconds after every 20 lookups.
        if count % 20 == 0:
            time.sleep(random.uniform(3, 5))

    # NOTE(review): the original paste lost its indentation; the beep is
    # assumed to run once after the whole batch has finished -- confirm.
    duration = 50  # milliseconds
    freq = 400  # Hz
    winsound.Beep(freq, duration)