使用 Python 爬虫获取新华字典的数据

数据库字段设计
(原文此处为表结构截图,图片未保留。)
数据表 word 包含以下字段:word_id(自动递增)、word、oldword、pinyin、radicals、explanation。

Python 代码如下:


import requests
import pymysql
from bs4 import BeautifulSoup

def _get(url, timeout):
    """Issue a GET request for ``url``; report and return ``None`` if it fails.

    Only transport-level failures (connection errors, timeouts) are handled
    here. Checking the HTTP status code is left to the caller.
    """
    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f'{url} is failed! ({e})')
        return None


def _decode_gbk(response):
    """Decode the response body as GBK, dropping bytes that cannot be decoded."""
    return response.content.decode('gbk', errors='ignore')


def _parse_word(page_text):
    """Extract (word, oldword, pinyin, radicals, explanation) from a character page.

    The fields are read from fixed cell positions of the fifth ``<table>`` on
    the page. Returns ``None`` when the page does not have that layout.
    """
    # Turn <br> tags into newlines first so line breaks survive ``.text``.
    normalized = page_text.replace('<br/>', '\n').replace('<br>', '\n')
    soup = BeautifulSoup(normalized, "lxml")
    try:
        td = soup.find_all('table')[4].find_all('td')
        return tuple(td[i].text.strip() for i in (1, 4, 8, 10, 12))
    except IndexError:
        return None


def _connect():
    """Open a connection to the MySQL database that holds the ``word`` table."""
    return pymysql.connect(
        host='localhost',
        user='root',
        # Password
        password='******',
        # Database name
        db='*****',
        charset='utf8'
    )


def _insert_word(conn, fields):
    """Insert one row into ``word``; commit on success, roll back on failure."""
    insert_sql = "insert into word(word,oldword,pinyin,radicals,explanation) values(%s,%s,%s,%s,%s)"
    # A cursor is the object used to send SQL statements to the server.
    cur = conn.cursor()
    try:
        cur.execute(insert_sql, fields)
        conn.commit()
        print("插入数据成功;")
    except Exception as e:
        # Best-effort boundary: one bad row must not stop the crawl.
        print("插入数据失败:", e)
        conn.rollback()
    finally:
        cur.close()


def downloader(url, timeout=10):
    """
    Download the characters linked from one index page and save them to MySQL.

    Fetches ``url``, collects every ``<a target="_blank">`` link on it,
    fetches each linked character page, extracts the character fields and
    inserts them into the ``word`` table. Pages that cannot be fetched or
    parsed are reported on stdout and skipped.

    Args:
        url: URL of an index page.
        timeout: Timeout in seconds passed to every ``requests.get`` call.

    Returns:
        None.
    """
    response = _get(url, timeout)
    if response is None:
        return
    if response.status_code != 200:
        print(f'{url} is failed!')
        return

    print(f'{url} is parsing')
    index_page = BeautifulSoup(_decode_gbk(response), "lxml")

    prefix = 'http://www.zd9999.com'
    # Anchors without an href would make the concatenation fail, so skip them.
    word_urls = [
        prefix + anchor.get('href')
        for anchor in index_page.find_all('a', target="_blank")
        if anchor.get('href')
    ]
    if not word_urls:
        return

    # One connection per index page, always closed, instead of one leaked
    # connection per character.
    conn = _connect()
    try:
        for word_url in word_urls:
            response = _get(word_url, timeout)
            if response is None:
                continue
            print(f'{[word_url]} is parsing')
            if response.status_code != 200:
                print(f'{word_url} is failed!')
                continue

            fields = _parse_word(_decode_gbk(response))
            if fields is None:
                # Unexpected page layout: report it and move on.
                print(f'{word_url} is failed!')
                continue

            _insert_word(conn, fields)
    finally:
        conn.close()

if __name__ == '__main__':
    # The first index page has no numeric suffix; the remaining pages follow
    # the index_<n>.htm pattern for n = 2 .. 101.
    index_urls = ['http://www.zd9999.com/zi/index.htm']
    index_urls.extend(
        f'http://www.zd9999.com/zi/index_{page}.htm' for page in range(2, 102)
    )
    for index_url in index_urls:
        downloader(index_url)

效果如下:
在这里插入图片描述

在这里插入图片描述

参考:https://github.com/pwxcoo/chinese-xinhua

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值