python 爬虫遇到字体加密解决方案(woff2) ddddocr + 百度ocr 文字识别

最新推荐文章于 2024-07-31 15:55:27 发布

lewis@110

最新推荐文章于 2024-07-31 15:55:27 发布

阅读量2.7k

点赞数 3

分类专栏：爬虫文章标签： python 爬虫

本文链接：https://blog.csdn.net/huagangwang/article/details/132887899

版权

爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

第一步，找到目标网站的font链接，下载对应的 woff2文件。

第二步，使用 “High-Logic FontCreator” 工具打开（下载链接：Download font software (high-logic.com)）

文件打开是这样的：显示有 363个文字。

第三步，切割单个文字，代码如下：

# -*- coding: utf-8 -*-

import os
from fontTools.ttLib import TTFont
from fontTools.pens.freetypePen import FreeTypePen
import matplotlib.pyplot as plt


# 拆解woff2文件，保存为单个字体图片:保存至 imgs 文件夹
def font_split_single_img(font_path='e26e946d8b2ccb7.woff2', out_dir='imgs'):
    """Render every glyph referenced by the font's cmap into its own image.

    Each glyph is rasterised with ``FreeTypePen`` and written through
    matplotlib to ``<out_dir>/<glyph name>.jpg``.

    Args:
        font_path: Path of the font file (woff2) to split.
        out_dir: Directory receiving the images; created when missing.
    """
    font = TTFont(font_path)
    try:
        # getBestCmap() returns None when the font has no Unicode cmap.
        cmap = font.getBestCmap() or {}
        # The glyph set is loop-invariant, so build it once.
        glyph_set = font.getGlyphSet()
        os.makedirs(out_dir, exist_ok=True)
        total = len(cmap)
        for index, glyph_name in enumerate(cmap.values(), start=1):
            glyph = glyph_set[glyph_name]
            # Hand the glyph set to the pen so that component references of
            # composite glyphs can be resolved while drawing.
            pen = FreeTypePen(glyph_set)
            glyph.draw(pen)
            bitmap = pen.array()
            print(index, '/', total, '~~~', glyph)
            fig = plt.figure()
            plt.imshow(bitmap)
            plt.axis('off')  # hide the axes so only the glyph is saved
            plt.savefig(os.path.join(out_dir, '{0}.jpg'.format(glyph_name)))
            # Close the figure explicitly; otherwise one figure per glyph
            # would accumulate in memory.
            plt.close(fig)
    finally:
        font.close()

切割后的结果，注意命名：

第四步，图片识别字符（ddddocr + 百度ocr）

1、pip install ddddocr 安装

import ddddocr
from PIL import Image


# 用 ddddocr 识别图片文字,保存至 imgs_copy_word 文件夹
def ocrWords(src_dir='imgs', dst_dir='imgs_copy_word'):
    """Label every glyph image with the character recognised by ddddocr.

    Each image found under ``src_dir`` is classified and re-saved to
    ``dst_dir`` as ``<glyph name>__<label>.jpg``.

    Args:
        src_dir: Directory containing the per-glyph images.
        dst_dir: Directory receiving the labelled copies; created when missing.

    Returns:
        dict mapping the glyph name (image file name without extension) to
        its label. The label is the first recognised character, or
        ``'未找到'`` when nothing was recognised.
    """
    ocr = ddddocr.DdddOcr(beta=False, show_ad=False)
    word_map = {}
    os.makedirs(dst_dir, exist_ok=True)
    for parent, _dirnames, filenames in os.walk(src_dir):
        for filename in filenames:
            # splitext keeps dots that belong to the glyph name itself.
            k = os.path.splitext(filename)[0]
            current_path = os.path.join(parent, filename)
            with open(current_path, 'rb') as f:
                image = f.read()
            res = ocr.classification(image)
            if not res:
                # Keep the placeholder intact: it must not go through the
                # "first character only" truncation below.
                res = '未找到'
            else:
                res = res[0]
            print(k, 'res:', res)
            # NOTE: the label is embedded verbatim in the file name.
            with Image.open(current_path) as img:
                img.save(os.path.join(dst_dir, f'{k}__{res}.jpg'))
            word_map[k] = res
    return word_map

识别效果如下：

2、百度OCR识别（链接：网络图片识别_图片文字识别_图片转文字-百度AI开放平台）需要账号登录。

代码如下，注意，需要添加登录后的cookie，并在文件开头导入 base64、json、time、requests（后面几步的代码同样会用到 json 和 requests）：

# 用 百度ocr接口解析图片，保存至 imgs_copy_word_bdu 文件夹
def ocrWords_baidu_ocr(src_dir='imgs', dst_dir='imgs_copy_word_bdu', timeout=30):
    """Label every glyph image with the text returned by the Baidu OCR demo.

    Each image found under ``src_dir`` is posted (base64-encoded) to the demo
    endpoint and re-saved to ``dst_dir`` as ``<glyph name>__<label>.jpg``.
    The ``Cookie`` header must be replaced with a logged-in session cookie.

    Args:
        src_dir: Directory containing the per-glyph images.
        dst_dir: Directory receiving the labelled copies; created when missing.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        dict mapping the glyph name to its label. The label is the first
        recognised text, ``'未找到Result0'`` when the service succeeded but
        returned no text, or ``'未找到'`` when the request failed or the
        service did not report success.
    """
    url = 'https://ai.baidu.com/aidemo'
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "你登录后的cookie",
        "Host": "ai.baidu.com",
        "Referer": "https://ai.baidu.com/tech/ocr_others/webimage?_=1694671106471",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43",
    }

    word_map = {}
    index = 0
    os.makedirs(dst_dir, exist_ok=True)
    for parent, _dirnames, filenames in os.walk(src_dir):
        for filename in filenames:
            index += 1
            # splitext keeps dots that belong to the glyph name itself.
            k = os.path.splitext(filename)[0]
            current_path = os.path.join(parent, filename)
            with open(current_path, 'rb') as f:
                image = f.read()
            bs64_img = 'data:image/jpeg;base64,%s' % base64.b64encode(image).decode()
            post_data = {
                "image": bs64_img,
                "image_url": "",
                "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage",
            }
            # One failed request must not abort the whole run: report it and
            # fall back to the generic "not found" label.
            try:
                response = requests.post(url=url, headers=headers,
                                         data=post_data, timeout=timeout)
                content = response.json()
            except (requests.RequestException, ValueError) as err:
                print(k, 'request failed:', err)
                content = {}

            if isinstance(content, dict) and content.get('msg') == 'success':
                data = content.get('data') or {}
                words = data.get('words_result') or []
                if words:
                    res = words[0]['words']
                else:
                    res = '未找到Result0'
            else:
                res = '未找到'
            print(k, 'res:', res, index, len(filenames))
            # NOTE: the label is embedded verbatim in the file name.
            with Image.open(current_path) as img:
                img.save(os.path.join(dst_dir, f'{k}__{res}.jpg'))
            word_map[k] = res
            time.sleep(0.4)  # throttle requests to the demo endpoint
    return word_map

识别后的结果如图：

第五步， 合并比较前两步识别的结果，imgs_copy_word、imgs_copy_word_bdu，代码如下：

根据识别后的名称，提取结果，并保存为 .json 文件：ddddocr 识别的保存为 ocr_dddd.json，百度 ocr 识别的保存为 ocr_baidu.json

# Extract the labels from the labelled file names and save them as a .json file: ocr_dddd.json for ddddocr results, ocr_baidu.json for Baidu OCR results
def readImagName(imagesPath='imgs_copy_word', saveJsonName='ocr_dddd.json'):
    """Collect the labels encoded in image file names and save them as JSON.

    Files are expected to be named ``<glyph name>__<label>.<ext>``. Files
    without the ``__`` separator are ignored.

    Args:
        imagesPath: Directory (walked recursively) containing labelled images.
        saveJsonName: Path of the JSON file to write. Nothing is written when
            no labelled file is found.

    Returns:
        dict mapping glyph name to label.
    """
    word_map = {}
    for _parent, _dirnames, filenames in os.walk(imagesPath):
        for filename in filenames:
            # splitext strips only the extension, so a label that is itself
            # a dot (or a glyph name containing dots) survives.
            stem = os.path.splitext(filename)[0]
            key, sep, label = stem.partition('__')
            if not sep:
                continue  # not a labelled image
            word_map[key] = label
    if word_map:
        with open(saveJsonName, 'w', encoding='utf-8') as f:
            json.dump(word_map, f, ensure_ascii=False)
    return word_map

文件结果如下部分：

{
  "gid58344": "体",
  "gid58345": "y",
  "gid58346": "十",
  "gid58347": "现",
  "gid58348": "快",
  "gid58349": "便",
  "gid58350": "话",
  "gid58351": "却",
  "gid58352": "月",
  "gid58353": "物",
  "gid58354": "水",
  "gid58355": "的",
  "gid58356": "放",
  "gid58357": "知",
  "gid58358": "爱",
  "gid58359": "万",
    ......
    ......
    ......
    ......
}

第六步，对比差异，代码如下：结果保存为 ocr_diff.json

# 对比两个 json 文件，找出不同的结果，大家也可以用其他方法。文件：ocr_diff.json
def jsonDiff(path_ddddocr='ocr_dddd.json', path_baidu_ocr='ocr_baidu.json',
             path_diff='ocr_diff.json'):
    """Compare the two OCR result files and save the entries that disagree.

    Two labels agree when they are equal, or when the Baidu label equals the
    upper-cased ddddocr label. Every other entry is recorded as
    ``"<ddddocr label>, <baidu label>"``. A glyph present in only one file is
    recorded as a disagreement, with ``'未找到'`` standing in for the missing
    side.

    Args:
        path_ddddocr: JSON file holding the ddddocr labels.
        path_baidu_ocr: JSON file holding the Baidu OCR labels.
        path_diff: JSON file to write. Nothing is written when there is no
            disagreement.

    Returns:
        dict mapping glyph name to the ``"<ddddocr>, <baidu>"`` string.
    """
    with open(path_ddddocr, 'r', encoding='utf-8') as f:
        json_ddddocr = json.load(f)
    with open(path_baidu_ocr, 'r', encoding='utf-8') as f:
        json_bduocr = json.load(f)

    missing = '未找到'
    word_map_diff = {}
    for n, v in json_ddddocr.items():
        v2 = json_bduocr.get(n)
        if v2 is None:
            # Glyph was never labelled by Baidu OCR: needs manual review.
            print('不一致')
            word_map_diff[n] = f'{v}, {missing}'
        elif v == v2 or v.upper() == v2:
            print(n, v)
        else:
            print('不一致')
            word_map_diff[n] = f'{v}, {v2}'

    # Glyphs labelled only by Baidu OCR would otherwise go unnoticed.
    for n, v2 in json_bduocr.items():
        if n not in json_ddddocr:
            print('不一致')
            word_map_diff[n] = f'{missing}, {v2}'

    if word_map_diff:
        with open(path_diff, 'w', encoding='utf-8') as f:
            json.dump(word_map_diff, f, ensure_ascii=False)
    return word_map_diff

第七步，找出识别差异的图片，移动至新文件夹：imgs_copy_word_diff

# 根据结果差异名称：ocr_diff.json，筛选出识别结果差异图片文件，移动至 imgs_copy_word_diff 文件夹，并删除原始差异文件
def move_diffImg(diff_path='ocr_diff.json', dddd_dir='imgs_copy_word',
                 baidu_dir='imgs_copy_word_bdu', diff_dir='imgs_copy_word_diff'):
    """Move the images whose two OCR labels disagree into a review folder.

    For every ``"<glyph>": "<label a>, <label b>"`` entry of the diff file,
    ``<dddd_dir>/<glyph>__<label a>.jpg`` and
    ``<baidu_dir>/<glyph>__<label b>.jpg`` are moved to ``diff_dir``.
    Missing files and malformed entries are reported and skipped.

    Args:
        diff_path: JSON file listing the disagreeing glyphs.
        dddd_dir: Directory holding the ddddocr-labelled images.
        baidu_dir: Directory holding the Baidu-labelled images.
        diff_dir: Destination directory; created when missing.
    """
    import shutil

    with open(diff_path, 'r', encoding='utf-8') as f:
        diffs = json.load(f)
    os.makedirs(diff_dir, exist_ok=True)
    for n, v in diffs.items():
        print(n, v)
        # Entries are written as "<a>, <b>". Splitting on the first ", "
        # keeps a label that is itself a comma intact; a bare "," is accepted
        # as a fallback separator.
        first, sep, second = v.partition(', ')
        if not sep:
            first, sep, second = v.partition(',')
        if not sep:
            print(n, 'skipped: malformed diff entry')
            continue
        second = second.strip()
        for src_dir, label in ((dddd_dir, first), (baidu_dir, second)):
            name = f'{n}__{label}.jpg'
            src = os.path.join(src_dir, name)
            try:
                # A real move keeps the image bytes untouched and removes the
                # original in one step.
                shutil.move(src, os.path.join(diff_dir, name))
            except OSError as err:
                print(n, 'could not move', src, '-', err)

结果如图：这部分需要人工查看挑选，修改对应的结果。

挑选过后，把这部分正确的文件，分别复制到 imgs_copy_word、imgs_copy_word_bdu 文件夹内。到此为止，图片打标签完成。

第八步，最后。重新执行一次 readImagName() 或者 readImagName(imagesPath='imgs_copy_word_bdu', saveJsonName='ocr_baidu.json')

重新得到 ocr_dddd.json 或者 ocr_baidu.json

这里面就是最终得到的字典映射。

使用：如图接口返回的为字体加密数据：

# 使用
def _use():
    """Decode a font-obfuscated field of an API response and print it.

    Loads the glyph-name -> character mapping from ``ocr_dddd.json``, fetches
    the API response, and translates each character of the ``read_count``
    field via the font's cmap (code point -> glyph name) and the mapping.
    Characters that are not covered by the font or the mapping are kept
    as-is (stripped of whitespace).

    Returns:
        The decoded string (also printed).
    """
    with open('ocr_dddd.json', 'r', encoding='utf-8') as f:
        words_map = json.load(f)
    # Target API endpoint (placeholder; replace with the real URL).
    url = '链接就不放出来了。。。。。。'
    response = requests.get(url, timeout=30)
    content = response.content.decode('utf-8')
    txt = json.loads(content)
    b = txt['data']['book_list'][0]['read_count']

    font = TTFont('e26e946d8b2ccb7-500.woff2')
    try:
        # getBestCmap() returns None when the font has no Unicode cmap.
        cmap = font.getBestCmap() or {}
        words = []
        for itm in b:
            # Two-step lookup: code point -> glyph name -> real character.
            glyph_name = cmap.get(ord(itm))
            word = words_map.get(glyph_name)
            words.append(itm.strip() if word is None else word)
    finally:
        font.close()

    decoded = ''.join(words)
    print(decoded)
    return decoded

打印结果：