1、词库可以自定义,也可以从百度词库下载;从百度词库下载得到的文件是 .bdict 格式:
2、将 .bdict 格式的文件转为 txt:可参考文章《百度词库bdict、搜狗细胞词库scel 转 txt 格式》
3、解析成功后得到如下词库文件:
4、根据词库中的关键词直接在百度百科中搜索,并将搜索到的词条摘要存放到对应文件中,代码如下:
def crawler():
    """Crawl Baidu Baike (Baidu's online encyclopedia) web pages.

    In the listing as published, this function contains no statements other
    than this docstring and the note string below, so calling it does nothing
    and returns ``None``.

    NOTE(review): the listing lost its indentation, so it cannot be
    determined whether more code was originally nested in this function.

    :return: None
    """
    # The bare string below is an inline note kept verbatim from the original;
    # it links to an article about dropping Python 2 / Python 3 compatibility.
    "取消python2与python3的兼容:https://blog.csdn.net/qq_57155967/article/details/126183852"
import requests
import urllib.parse
from lxml import etree
from tqdm import trange
import re
def query(content, timeout=10):
    """Fetch the summary text of a Baidu Baike entry.

    Args:
        content: Keyword (entry title) to look up. It is URL-quoted and
            appended to ``https://baike.baidu.com/item/``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The concatenated text found inside the entry's summary ``div``
        (class ``lemma-summary`` or ``lemmaWgt-lemmaSummary``), with
        bracketed markers such as ``[1]`` and non-breaking spaces removed.
        Returns ``''`` when the page contains no such ``div`` and ``None``
        when the request fails, the status code is not 200, or the page
        cannot be parsed.
    """
    # Request headers: present the crawler as a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    try:
        # Request URL for the entry.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)
        # Without a timeout a single stalled connection blocks the caller
        # indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return None
        # Parse the untouched markup. Running text substitutions on raw HTML
        # can delete characters that belong to tags and attributes.
        html = etree.HTML(response.text)
        if html is None:
            return None
        # Collect every text node inside the summary block.
        fragments = html.xpath(
            '//div[contains(@class,"lemma-summary") or contains(@class,"lemmaWgt-lemmaSummary")]//text()')
        # Drop the newlines surrounding each fragment, then join them.
        summary = ''.join(fragment.strip('\n') for fragment in fragments)
    except Exception as e:
        # Deliberate best-effort boundary: a failure on one keyword is
        # reported and signalled with None instead of propagating.
        print(f"请求出错:{e}")
        return None
    # Clean the extracted text rather than the markup.
    summary = re.sub(r'\[.*?\]', '', summary)  # remove markers such as [1]
    # The parser turns &nbsp; entities into U+00A0 characters; remove them.
    return summary.replace('\xa0', '')
def _crawl_keyword_file(keyword_path, output_path):
    """Query every keyword listed in one file and save the results found.

    Args:
        keyword_path: UTF-8 text file with one keyword per line.
        output_path: UTF-8 text file that is overwritten with one result
            per line.
    """
    # Context managers close both files even if a query or write fails.
    with open(keyword_path, 'r', encoding='utf-8') as keyword_file:
        keywords = keyword_file.readlines()
    with open(output_path, 'w', encoding='utf-8') as output_file:
        # trange is the tqdm helper this file imports; it shows a progress bar.
        for line_no in trange(len(keywords)):
            content = keywords[line_no].strip()
            if not content:
                # A blank line is not a keyword; skip the pointless request.
                continue
            # Fetch the entry from Baidu Baike and write it out.
            result = query(content)
            # query() yields None on failure and '' when nothing matched;
            # neither can be written, and None + '\n' would raise TypeError.
            if result:
                output_file.write(result + '\n')
                print("查询结果:%s" % result)


if __name__ == '__main__':
    # Files holding the keywords.
    diclist1 = [
        '../../Datas/dataSet/txt/dict_file_1267_celebrity.txt',
        '../../Datas/dataSet/txt/dict_file_1556_festival.txt',
        '../../Datas/dataSet/txt/dict_file_4816_film.txt',
        '../../Datas/dataSet/txt/dict_file_6196_cate.txt',
        '../../Datas/dataSet/txt/dict_file_1091_opera.txt',
    ]
    # Files receiving the crawl results, in the same order as diclist1.
    diclist2 = [
        '../../Datas/cnpedia/celebrity.txt',
        '../../Datas/cnpedia/festival.txt',
        '../../Datas/cnpedia/film.txt',
        '../../Datas/cnpedia/cate.txt',
        '../../Datas/cnpedia/opera.txt',
    ]
    for dic1, dic2 in zip(diclist1, diclist2):
        _crawl_keyword_file(dic1, dic2)
注:关于爬取网页的具体操作,可以参考《小白如何入门 Python 爬虫?》等文章
6、爬取结果: