爬取各个时代不同字体的汉字图片数据

最新推荐文章于 2024-08-29 14:47:26 发布

南乔°

最新推荐文章于 2024-08-29 14:47:26 发布

阅读量381

点赞数

分类专栏：爬虫 文章标签：python 爬虫 开发语言

本文链接：https://blog.csdn.net/weixin_45454897/article/details/129332960

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import os
import requests

from bs4 import BeautifulSoup


# learning example:

# astr = '''aaaaa何时when 杖尔看see南雪snow，我me与梅花plum blossom两白头'''
# res = re.findall('[\u4e00-\u9fa5]', astr)
# print(res)

# a = []
# for i in range(0x4E00, 0x9FA6):
#     a.append(str(hex(i)).lstrip('0x'))
# print(a, len(a))

# unicode = b"\u" + chinese[0].encode("unicode-escape")
# print(unicode, unicode.decode("unicode-escape"))

#

# Build, in lockstep, every target character and the dictionary page URL that
# is requested for it.  The range covers code points U+4E00..U+9FA5 inclusive
# (20902 characters).
urls = []            # dictionary page URL for each character
chinese = []         # the characters themselves (20902 entries)
for i in range(0x4E00, 0x9FA6):
    # chr() maps the code point directly to its character; no need to round-trip
    # through a b"\u...." byte string and the unicode-escape codec.
    chinese.append(chr(i))
    # format(i, "x") yields the bare lowercase hex digits (e.g. "4e00").  Unlike
    # hex(i).lstrip("0x") it does not rely on lstrip's character-set semantics.
    urls.append("http://www.guoxuedashi.net/zidian/" + format(i, "x") + ".html")
print("共有{}个所要请求的url 以及 对应的{}个汉字".format(len(urls), len(chinese)))

# HTTP headers that present the client as a desktop Chrome browser.  Per the
# original author's note, the library's default client identity gets blocked
# by the site's anti-scraping measures.
request_header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.93 Safari/537.36"
    ),
}

# Root directory under which one sub-folder per character is created.
# Raw string: the backslash is a Windows path separator, not an escape, and a
# non-raw "\汉" is an invalid escape sequence that newer Pythons warn about.
path = r"E:\汉字数据集"

# Crawl loop: for every character, fetch its dictionary page, follow that
# page's "详情" (details) link, and save every <img width="80"> found on the
# details page as <path>/<character>/<n>.png.
#
# Any requests exception (connection error, timeout, ...) propagates and stops
# the script; exceptions are not caught here.

# Seconds to wait on any single HTTP request before requests raises a timeout;
# without it a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

for i, (url, character) in enumerate(zip(urls, chinese), start=1):
    # headers= must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so passing the dict positionally would put the
    # User-Agent into the query string instead of the request headers.
    # verify=False disables TLS certificate verification for https URLs.
    response = requests.get(url, headers=request_header, verify=False,
                            timeout=REQUEST_TIMEOUT)
    print("******************第 {} 个 url --- {}".format(i, url), response, "******************")
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the first anchor that has the exact text node '  详情\r\n ' among its
    # children.  Stop at the first hit so that several matching anchors cannot
    # be concatenated into one malformed URL.
    detail_path = None
    for a in soup.find_all("a"):
        if '  详情\r\n ' in list(a.children) and a.get("href"):
            detail_path = a["href"]
            break

    # No details link on this page: nothing to download for this character.
    if detail_path is None:
        print("****当前汉字\"{}\"未找到详情链接***跳过****".format(character))
        continue

    href = "http://www.guoxuedashi.net" + detail_path   # URL of the details page
    r = requests.get(href, headers=request_header, verify=False,
                     timeout=REQUEST_TIMEOUT)
    print("******************", href, r, "******************")
    r.encoding = r.apparent_encoding

    s = BeautifulSoup(r.text, "html.parser")
    imgs = s.find_all("img", {"width": "80"})

    print("****当前汉字\"{}\"文件夹***共有{}张汉字图片***正在生成...****".format(character, len(imgs)))

    # No images for this character: skip it without creating its folder.
    if not imgs:
        continue

    # Create the character's folder once, before downloading its images.
    chinese_path = os.path.join(path, character)
    os.makedirs(chinese_path, exist_ok=True)

    for j, img in enumerate(imgs, start=1):
        rp = requests.get(img["src"], headers=request_header, verify=False,
                          timeout=REQUEST_TIMEOUT)
        # Do not store an HTTP error page under a .png name.
        if not rp.ok:
            print("****第 {} 张图片下载失败 ({})***跳过****".format(j, rp.status_code))
            continue

        img_path = os.path.join(chinese_path, str(j) + '.png')
        with open(img_path, 'wb') as pic:
            pic.write(rp.content)
    print("****当前汉字\"{}\"文件夹***共有{}张汉字图片***生成完毕!!!****".format(character, len(imgs)))