"""一:百度词语爬虫"""
import urllib.request
from urllib.parse import urlencode
from lxml import etree
import json
import time
def digui(url, headers, retries=5):
    """Fetch a URL, retrying recursively on failure (digui = "recursion");
    retries caps the attempts so a dead link cannot recurse forever."""
    try:
        request = urllib.request.Request(url, headers=headers)
        return urllib.request.urlopen(request, timeout=0.7).read().decode("utf8")
    except Exception:
        if retries <= 0:
            return None
        time.sleep(10)
        return digui(url, headers, retries - 1)
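
# An essentially equivalent iterative form (a sketch; fetch_iter is not part of
# the original script) keeps the same capped-retry behaviour without recursion:
def fetch_iter(url, headers, retries=5):
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(request, timeout=0.7).read().decode("utf8")
        except Exception:
            time.sleep(10)
    return None  # all attempts failed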
def load_baidu_page(kw, url):
    """Fetch the result page for kw and extract the dictionary fields."""
    dic1 = {}
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # No 'Accept-Encoding' header: urllib does not decompress gzip/br
        # responses, so requesting them would break .decode("utf8").
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    html = digui(url, headers)
    if html is None:  # all retries failed; skip this keyword
        return
    content = etree.HTML(html)
    if len(kw) == 1:
        # Single characters: the page also exposes radical, stroke count and compounds.
        link_list_pinyin = content.xpath('//div[@class="pronounce"]//b/text()')  # pinyin
        link_list1 = content.xpath('//div//p/text()')  # detailed definition
        link_synonym = content.xpath('//div[@id="synonym"]//a/text()')  # synonyms
        link_antonym = content.xpath('//div[@id="antonym"]//a/text()')  # antonyms
        link_radical = content.xpath('//li[@id="radical"]/span/text()')  # radical
        link_stroke = content.xpath('//li[@id="stroke_count"]/span/text()')  # stroke count
        link_content = content.xpath('//div[@class="tab-content"]/a/text()')  # related compounds
        dic1["关键词"] = kw
        dic1["拼音"] = link_list_pinyin
        dic1["释义"] = link_list1
        dic1["近义词"] = link_synonym
        dic1["反义词"] = link_antonym
        dic1["部首"] = link_radical
        dic1["笔画"] = link_stroke
        dic1["相关组词"] = link_content
    else:
        # Multi-character words: only pinyin, definition, synonyms and antonyms.
        link_list1 = content.xpath('//div//p/text()')  # detailed definition
        link_list_pinyin = content.xpath('//div/dl/dt[@class="pinyin"]/text()')  # pinyin
        link_synonym = content.xpath('//div[@id="synonym"]//a/text()')  # synonyms
        link_antonym = content.xpath('//div[@id="antonym"]//a/text()')  # antonyms
        dic1["关键词"] = kw
        dic1["拼音"] = link_list_pinyin
        dic1["释义"] = link_list1
        dic1["近义词"] = link_synonym
        dic1["反义词"] = link_antonym
    save_file(dic1)
def save_file(dic):
    """Append one record to result.json (note: the file ends up holding
    concatenated JSON objects, not a single JSON array)."""
    json_str = json.dumps(dic, ensure_ascii=False, indent=4)
    with open("result.json", "a", encoding="utf8") as file1:
        file1.write(json_str + "\n")
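
# Because save_file appends pretty-printed objects back to back, result.json is
# not one valid JSON document. A sketch for reading the records back
# (read_results is not part of the original script), using json.JSONDecoder.raw_decode:
def read_results(path="result.json"):
    decoder = json.JSONDecoder()
    with open(path, encoding="utf8") as f:
        text = f.read()
    records, idx = [], 0
    while idx < len(text):
        while idx < len(text) and text[idx].isspace():
            idx += 1  # skip the separators between objects
        if idx >= len(text):
            break
        obj, idx = decoder.raw_decode(text, idx)
        records.append(obj)
    return records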
if __name__ == "__main__":
    # Read each keyword from the corpus and build the corresponding search URL.
    # kw = input("请输入要搜索的关键词: ")
    url = "https://hanyu.baidu.com/s"
    with open("现代汉语常用词表.txt", "r", encoding="utf8") as file2:
        lines = file2.readlines()
    for line in lines:
        fields = line.split()
        if len(fields) == 0:  # skip blank lines
            continue
        kw = fields[0]  # the word is the first whitespace-separated field
        print(kw)
        word = {"wd": kw}
        key = urlencode(word)
        # e.g. kw = "学习" yields https://hanyu.baidu.com/s?wd=%E5%AD%A6%E4%B9%A0&ptype=zici
        fullurl = url + "?" + key + "&ptype=zici"  # full search URL
        load_baidu_page(kw, fullurl)  # scrape the Baidu Hanyu result page
Single characters and multi-character words are handled as two separate cases; the corpus is the Modern Chinese Common Words list (现代汉语常用词表), over 56,000 entries in total.
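For reference, each record appended to result.json has the shape below (the values are placeholders, not actual crawl output); single-character records additionally carry the 部首, 笔画 and 相关组词 fields:

{
    "关键词": "<word>",
    "拼音": ["<pinyin>"],
    "释义": ["<definition line>", "..."],
    "近义词": ["<synonym>", "..."],
    "反义词": ["<antonym>", "..."]
}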
Processing results: