爬取百度词语的相关内容

最新推荐文章于 2020-11-20 20:44:56 发布

爱吃零食的水泥大仙

最新推荐文章于 2020-11-20 20:44:56 发布

阅读量333

点赞数 1

分类专栏：笔记　文章标签：python xpath mysql

本文链接：https://blog.csdn.net/qq_37268093/article/details/109607395

版权

笔记专栏收录该内容

16 篇文章 1 订阅

订阅专栏

需求：

根据HSK词汇表搜索相关词语，并爬取其中的拼音，释义、同义/近义/反义词

使用语言及开发工具：

python
pycharm

目标网站：

百度汉语：https://hanyu.baidu.com/

目标网页分析：

网页首页无任何东西，需要搜索进行跳转
在这里插入图片描述
F12查看JavaScript加载后的网页源代码

右击查看网页源代码

对比两者可以发现，搜索跳转后页面的内容已经直接包含在服务器返回的 HTML 源代码中，属于静态网页。因此不需要进行逆向分析，也不需要使用 selenium 库。

ps. F12 调出的是浏览器执行 JavaScript、渲染之后的页面结构（DOM），并不等同于服务器返回的原始网页 HTML 源代码；后者需要通过“右击查看网页源代码”查看。

爬取思路

1、获取页面
2、创建一个字典，用于存储爬取的相关数据
3、将字典存储为json文件，方便导入MySQL
4、连接数据库，使用for循环爬取HSK考试常用词组

实现代码

导入相关的包

import urllib.request
from lxml import etree
from urllib.parse import urlencode, unquote
import requests
import re
import json
import time
import pymysql

1、获取页面

def Net(url, headers, retries=None, delay=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    A failed attempt (connection error, timeout, HTTP error status or a body
    that is not valid UTF-8) is retried after sleeping ``delay`` seconds.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers sent with the request.
        retries: Maximum number of retries after the first attempt.
            ``None`` (the default) keeps retrying until a request succeeds.
        delay: Seconds to wait between attempts.

    Returns:
        The decoded page text.

    Raises:
        Exception: The error of the last attempt once ``retries`` is
            exhausted. Never raised with the default ``retries=None``.
    """
    attempt = 0
    # Loop instead of recursing: the recursive retry discarded the result of
    # the retried call (so the caller received None) and added a stack frame
    # for every failure.
    while True:
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, timeout=0.7) as response:
                return response.read().decode("utf8")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed by the retry loop.
            if retries is not None and attempt >= retries:
                raise
            attempt += 1
            time.sleep(delay)

def get_baidu_page(kw,url):
    """Download the result page at ``url`` and parse it into an lxml tree.

    Args:
        kw: Keyword being looked up (not used by this step of the function).
        url: Full search URL for the keyword.
    """
    # Browser-like request headers. Header names and values must not contain
    # stray spaces ('Accept - Language' is not a valid HTTP field name).
    # No Accept-Encoding header is sent: Net() decodes the raw body as UTF-8,
    # so a compressed response could not be decoded.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    html = Net(url,headers)
    content = etree.HTML(str(html))

2、创建字典

# The word list mixes single characters and multi-character words, and the
# site lays the two kinds of page out differently, so the pinyin is read from
# a different node depending on the keyword length.
single_char = len(kw) == 1
if single_char:
    pinyin_path = '//div[@class="pronounce"]//b/text()'
else:
    pinyin_path = '//div/dl/dt[@class="pinyin"]/text()'

# Fields collected for every keyword, in the order they are written out.
dic1 = {
    "关键词": kw,
    '拼音': content.xpath(pinyin_path),                               # pinyin
    "释义": content.xpath('//div//p/text()'),                         # definitions
    "近义词": content.xpath('//div[@id="synonym"]//a/text()'),        # synonyms
    "反义词": content.xpath('//div[@id="antonym"]//a/text()'),        # antonyms
}

# Extra fields that are only collected for single characters.
if single_char:
    dic1["部首"] = content.xpath('//li[@id="radical"]/span/text()')           # radical
    dic1["笔画"] = content.xpath('//li[@id="stroke_count"]/span/text()')      # stroke count
    dic1["相关组词"] = content.xpath('//div[@class="tab-content"]/a/text()')  # related words

3、存储文件

def save_file(dic, path="result.json"):
    """Append ``dic`` to ``path`` as one pretty-printed JSON document.

    Each call adds one document followed by a newline, so consecutive
    records no longer run together as ``}{``. The file as a whole is a
    sequence of JSON documents, not a single JSON document.

    Args:
        dic: JSON-serialisable mapping to store.
        path: Output file, opened in append mode. Defaults to
            ``"result.json"`` in the current working directory.
    """
    json_str = json.dumps(dic, ensure_ascii=False, indent=4)
    with open(path, "a", encoding="utf8") as file1:
        file1.write(json_str + "\n")

4、连接数据库循环爬取

# Read the word list from MySQL. pymysql.connect() is called with keyword
# arguments because PyMySQL 1.0+ no longer accepts positional ones.
conn = pymysql.connect(host="localhost", user="root", password="123456", database="sys")
try:
    with conn.cursor() as cursor:
        sql = "select WORD from bucong"
        cursor.execute(sql)
        results = cursor.fetchall()
finally:
    # All rows are already in memory, so the connection is not needed
    # while crawling.
    conn.close()

# Rows before this index are skipped -- presumably to resume an earlier,
# interrupted run; TODO confirm and reset to 0 for a full crawl.
START_INDEX = 426

for row in results[START_INDEX:]:
    kw = row[0]
    print(kw)
    word = {"wd": kw}
    key = urllib.parse.urlencode(word)
    url = "https://hanyu.baidu.com/s"
    fullurl = url + "?" + key + "&ptype=zici"
    # Some entries are not available on the site and make the crawl step
    # raise; report the failure and continue with the next keyword.
    try:
        get_baidu_page(kw, fullurl)
    except Exception as exc:
        print("skipped {!r}: {}".format(kw, exc))

完整代码

"""一：百度词语爬虫"""
import urllib.request
from lxml import etree
from urllib.parse import urlencode, unquote
import requests
import re
import json
import time
import pymysql

def digui(url, headers, retries=None, delay=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    A failed attempt (connection error, timeout, HTTP error status or a body
    that is not valid UTF-8) is retried after sleeping ``delay`` seconds.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers sent with the request.
        retries: Maximum number of retries after the first attempt.
            ``None`` (the default) keeps retrying until a request succeeds.
        delay: Seconds to wait between attempts.

    Returns:
        The decoded page text.

    Raises:
        Exception: The error of the last attempt once ``retries`` is
            exhausted. Never raised with the default ``retries=None``.
    """
    attempt = 0
    # Loop instead of recursing: the recursive retry discarded the result of
    # the retried call (so the caller received None) and added a stack frame
    # for every failure.
    while True:
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, timeout=0.7) as response:
                return response.read().decode("utf8")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed by the retry loop.
            if retries is not None and attempt >= retries:
                raise
            attempt += 1
            time.sleep(delay)


def get_baidu_page(kw,url):
    """Fetch the result page for ``kw``, extract its fields and save them.

    The extracted record is passed to ``save_file``. Single characters and
    multi-character words are read with different XPath expressions, and
    three extra fields are collected for single characters.

    Args:
        kw: Keyword being looked up; its length selects the extraction path.
        url: Full search URL for the keyword.
    """
    # Browser-like request headers. Header names and values must not contain
    # stray spaces ('Accept - Language' is not a valid HTTP field name).
    # No Accept-Encoding header is sent: digui() decodes the raw body as
    # UTF-8, so a compressed response could not be decoded.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    html = digui(url,headers)
    if html is None:
        # Without this guard str(None) would be parsed as the literal text
        # "None" and an all-empty record would be saved for the keyword.
        print("no page received for {!r}, record skipped".format(kw))
        return
    content = etree.HTML(str(html))

    # The pinyin sits under a different node on single-character pages.
    single_char = len(kw) == 1
    if single_char:
        pinyin_path = '//div[@class="pronounce"]//b/text()'
    else:
        pinyin_path = '//div/dl/dt[@class="pinyin"]/text()'

    # Fields collected for every keyword, in the order they are written out.
    dic1 = {
        "关键词": kw,
        '拼音': content.xpath(pinyin_path),                               # pinyin
        "释义": content.xpath('//div//p/text()'),                         # definitions
        "近义词": content.xpath('//div[@id="synonym"]//a/text()'),        # synonyms
        "反义词": content.xpath('//div[@id="antonym"]//a/text()'),        # antonyms
    }
    # Extra fields that are only collected for single characters.
    if single_char:
        dic1["部首"] = content.xpath('//li[@id="radical"]/span/text()')           # radical
        dic1["笔画"] = content.xpath('//li[@id="stroke_count"]/span/text()')      # stroke count
        dic1["相关组词"] = content.xpath('//div[@class="tab-content"]/a/text()')  # related words

    save_file(dic1)

def save_file(dic, path="result.json"):
    """Append ``dic`` to ``path`` as one pretty-printed JSON document.

    Each call adds one document followed by a newline, so consecutive
    records no longer run together as ``}{``. The file as a whole is a
    sequence of JSON documents, not a single JSON document.

    Args:
        dic: JSON-serialisable mapping to store.
        path: Output file, opened in append mode. Defaults to
            ``"result.json"`` in the current working directory.
    """
    json_str = json.dumps(dic, ensure_ascii=False, indent=4)
    with open(path, "a", encoding="utf8") as file1:
        file1.write(json_str + "\n")



if __name__ == "__main__":
    # Read the word list from MySQL, then crawl the page of every keyword.
    # pymysql.connect() is called with keyword arguments because
    # PyMySQL 1.0+ no longer accepts positional ones.
    conn = pymysql.connect(host="localhost", user="root", password="123456", database="sys")
    try:
        with conn.cursor() as cursor:
            sql = "select WORD from bucong"
            cursor.execute(sql)
            results = cursor.fetchall()
    finally:
        # All rows are already in memory, so the connection is not needed
        # while crawling.
        conn.close()

    # Rows before this index are skipped -- presumably to resume an earlier,
    # interrupted run; TODO confirm and reset to 0 for a full crawl.
    START_INDEX = 426

    for row in results[START_INDEX:]:
        kw = row[0]
        print(kw)
        word = {"wd": kw}
        key = urllib.parse.urlencode(word)
        url = "https://hanyu.baidu.com/s"
        fullurl = url + "?" + key + "&ptype=zici"
        # Report a failing keyword and continue with the next one instead of
        # aborting the whole crawl.
        try:
            get_baidu_page(kw, fullurl)
        except Exception as exc:
            print("skipped {!r}: {}".format(kw, exc))

运行效果

在这里插入图片描述

爱吃零食的水泥大仙

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
2
评论
爬取百度词语的相关内容

需求：根据HSK词汇表搜索相关词语，并爬取其中的拼音，释义、同义/近义/反义词使用语言及编译器：pythonpycharm目标网站：百度汉语：https://hanyu.baidu.com/目标网页分析：网页首页无任何东西，需要搜索进行跳转F12查看JavaScript加载后的网页源代码右击查看网页源代码通过对比，网页搜索跳转以后加载的是静态网页。故不需要进行逆向分析或者使用selenium库。ps.F12调出的是网页html代码的集合，并不是完整的网页html代码爬取思路
复制链接

扫一扫