python 爬虫遇到字体加密解决方案(woff2) ddddocr + 百度ocr 文字识别

第一步,找到目标网站的font链接,下载对应的 woff2文件。

第二步,使用 “High-Logic FontCreator” 工具打开该字体文件(下载链接:Download font software (high-logic.com))。

文件打开是这样的:显示有 363个文字。

第三步,切割单个文字,代码如下:

# -*- coding: utf-8 -*-

import base64
import json
import os
import shutil
import time

import matplotlib.pyplot as plt
import requests
from fontTools.pens.freetypePen import FreeTypePen
from fontTools.ttLib import TTFont


# Split the woff2 font into one rendered image per glyph, saved under the ``imgs`` folder.
def font_split_single_img(fontPath='e26e946d8b2ccb7.woff2', savePath='imgs'):
    """Render every glyph referenced by the font's cmap to ``<savePath>/<glyph name>.jpg``.

    The image file is named after the glyph name (the cmap value), which the
    later OCR steps use as the dictionary key.

    Args:
        fontPath: Path of the font file to load.
        savePath: Directory that receives the images; created when missing.
    """
    font = TTFont(fontPath)
    cmap = font.getBestCmap()
    # font.saveXML('font.xml') can be used here to dump the font as XML for inspection.

    # Loop invariants: build the glyph set and the output folder only once.
    glyph_set = font.getGlyphSet()
    os.makedirs(savePath, exist_ok=True)
    total = len(cmap)

    for index, glyph_name in enumerate(cmap.values(), start=1):
        glyph = glyph_set[glyph_name]  # look the glyph object up by its name
        pen = FreeTypePen(None)
        glyph.draw(pen)  # trace the glyph outline into the pen
        bitmap = pen.array()  # pen.show() would display it instead
        print(index, '/', total, '~~~', glyph)

        plt.figure()
        plt.imshow(bitmap)
        plt.axis('off')  # hide the axes so only the glyph is saved
        plt.savefig(os.path.join(savePath, '{0}.jpg'.format(glyph_name)))
        # Closing the figure releases it; a separate clf()/cla() is not needed.
        plt.close()

切割后的结果,注意命名:

第四步,图片识别字符(ddddocr + 百度ocr)

        1、pip install ddddocr 安装

import ddddocr
from PIL import Image


# Recognize every glyph image with ddddocr and save labelled copies under ``imgs_copy_word``.
def ocrWords(imagesPath='imgs', savePath='imgs_copy_word'):
    """Run ddddocr over each image and save a copy named ``<glyph>__<result>.jpg``.

    Args:
        imagesPath: Directory walked recursively for glyph images.
        savePath: Directory that receives the labelled copies; created when missing.

    Returns:
        dict mapping the glyph name (file name up to the first dot) to the
        recognized character, or to '未找到' when nothing was recognized.
    """
    ocr = ddddocr.DdddOcr(beta=False, show_ad=False)
    os.makedirs(savePath, exist_ok=True)
    word_map = {}
    for parent, _dirnames, filenames in os.walk(imagesPath):
        for filename in filenames:
            k = filename.split('.')[0]
            currentPath = os.path.join(parent, filename)
            with open(currentPath, 'rb') as f:
                image = f.read()
            res = ocr.classification(image)
            if not res:
                res = '未找到'
            elif len(res) > 1:
                # Keep only the first character of a multi-character result.
                # This must be ``elif``: a plain ``if`` would also truncate the
                # placeholder above down to its first character.
                res = res[0]
            print(k, 'res:', res)
            with Image.open(currentPath) as img:
                img.save(os.path.join(savePath, f'{k}__{res}.jpg'))
            word_map[k] = res
    return word_map

识别效果如下:

2、百度OCR识别(链接:网络图片识别_图片文字识别_图片转文字-百度AI开放平台)需要账号登录。

代码如下,注意,需要添加登录后的cookie:

def _baidu_first_word(content):
    """Return the first recognized string of a decoded Baidu response, or a placeholder."""
    if not isinstance(content, dict) or content.get('msg') != 'success':
        return '未找到'
    data = content.get('data') or {}
    words_result = data.get('words_result') or []
    if not words_result or int(data.get('words_result_num', len(words_result))) == 0:
        return '未找到Result0'
    return words_result[0]['words']


# Recognize every glyph image through the Baidu OCR demo endpoint and save
# labelled copies under ``imgs_copy_word_bdu``.
def ocrWords_baidu_ocr(imagesPath='imgs', savePath='imgs_copy_word_bdu', cookie='你登录后的cookie'):
    """Post each image to the Baidu demo endpoint and save ``<glyph>__<result>.jpg``.

    A failed request or an unparsable response is reported and labelled with
    the '未找到' placeholder instead of aborting the whole run.

    Args:
        imagesPath: Directory walked recursively for glyph images.
        savePath: Directory that receives the labelled copies; created when missing.
        cookie: Value sent as the ``Cookie`` header (the cookie of a logged-in session).

    Returns:
        dict mapping the glyph name (file name up to the first dot) to the
        recognized text or a placeholder.
    """
    url = 'https://ai.baidu.com/aidemo'
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": cookie,
        "Host": "ai.baidu.com",
        "Referer": "https://ai.baidu.com/tech/ocr_others/webimage?_=1694671106471",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43",
    }

    os.makedirs(savePath, exist_ok=True)
    word_map = {}
    index = 0
    for parent, _dirnames, filenames in os.walk(imagesPath):
        for filename in filenames:
            index += 1
            # To resume an interrupted run, skip already processed files here,
            # e.g. ``if index <= 332: continue``.
            k = filename.split('.')[0]
            currentPath = os.path.join(parent, filename)
            with open(currentPath, 'rb') as f:
                image = f.read()
            bs64Img = 'data:image/jpeg;base64,%s' % base64.b64encode(image).decode()
            postData = {
                "image": bs64Img,
                "image_url": "",
                "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage",
            }
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.post(url=url, headers=headers, data=postData, timeout=30)
                content = json.loads(response.text.strip())
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                print(k, 'request failed:', e)
                content = None
            res = _baidu_first_word(content)
            print(k, 'res:', res, index, len(filenames))
            with Image.open(currentPath) as img:
                img.save(os.path.join(savePath, f'{k}__{res}.jpg'))
            word_map[k] = res
            time.sleep(0.4)  # pause between requests
    return word_map

识别后的结果如图:

第五步, 合并比较前两步识别的结果,imgs_copy_word、imgs_copy_word_bdu,代码如下:

根据识别后的文件名提取结果,并保存为 .json 文件:ddddocr 识别的结果保存为 ocr_dddd.json,百度 ocr 识别的结果保存为 ocr_baidu.json。
# Rebuild the glyph -> character mapping from the labelled file names and save it as JSON
# (ocr_dddd.json for the ddddocr folder, ocr_baidu.json for the Baidu OCR folder).
def readImagName(imagesPath='imgs_copy_word', saveJsonName='ocr_dddd.json'):
    """Collect ``<glyph>__<result>.<ext>`` file names into a JSON mapping.

    Files whose name does not contain the ``__`` separator are skipped. The
    JSON file is written only when at least one mapping was found.

    Args:
        imagesPath: Directory walked recursively for labelled images.
        saveJsonName: Path of the JSON file to write.

    Returns:
        dict mapping glyph name to the recognized text taken from the file name.
    """
    word_map = {}
    for _parent, _dirnames, filenames in os.walk(imagesPath):
        for filename in filenames:
            # splitext removes only the final extension, so a label that itself
            # contains a dot is kept intact.
            stem = os.path.splitext(filename)[0]
            glyph_name, sep, res = stem.partition('__')
            if not sep:
                # Not a labelled image, e.g. a stray hidden file.
                continue
            word_map[glyph_name] = res
    if word_map:
        with open(saveJsonName, 'w', encoding='utf-8') as f:
            f.write(json.dumps(word_map, ensure_ascii=False))
    return word_map

文件的部分内容如下:

{
  "gid58344": "体",
  "gid58345": "y",
  "gid58346": "十",
  "gid58347": "现",
  "gid58348": "快",
  "gid58349": "便",
  "gid58350": "话",
  "gid58351": "却",
  "gid58352": "月",
  "gid58353": "物",
  "gid58354": "水",
  "gid58355": "的",
  "gid58356": "放",
  "gid58357": "知",
  "gid58358": "爱",
  "gid58359": "万",
    ......
    ......
    ......
    ......
}

第六步,对比差异,代码如下:结果保存为  ocr_diff.json

# Compare the two OCR result files and record the glyphs whose results disagree
# (any other diff method works as well). Output file: ocr_diff.json
def jsonDiff(path_ddddocr='ocr_dddd.json', path_baidu_ocr='ocr_baidu.json', path_diff='ocr_diff.json'):
    """Write the glyphs on which ddddocr and Baidu OCR disagree to ``path_diff``.

    Two results match when they are equal, or when the upper-cased ddddocr
    result equals the Baidu result. A glyph that has no entry in the Baidu file
    is always reported as a difference. The diff file is written only when at
    least one difference exists.

    Args:
        path_ddddocr: JSON mapping produced from the ddddocr results.
        path_baidu_ocr: JSON mapping produced from the Baidu OCR results.
        path_diff: Path of the JSON file that receives the differences.

    Returns:
        dict mapping glyph name to ``'<ddddocr result>, <baidu result>'``.
    """
    with open(path_ddddocr, 'r', encoding='utf-8') as f:
        json_ddddocr = json.load(f)
    with open(path_baidu_ocr, 'r', encoding='utf-8') as f:
        json_bduocr = json.load(f)

    word_map_diff = {}
    for n, v in json_ddddocr.items():
        if n in json_bduocr:
            v2 = json_bduocr[n]
            matched = v == v2 or v.upper() == v2
        else:
            # No Baidu result for this glyph: flag it for manual review rather
            # than failing with a KeyError.
            v2 = '未找到'
            matched = False
        if matched:
            print(n, v)
        else:
            print(n, '不一致')
            word_map_diff[n] = f'{v}, {v2}'
    # Save the mismatches.
    if word_map_diff:
        with open(path_diff, 'w', encoding='utf-8') as f:
            f.write(json.dumps(word_map_diff, ensure_ascii=False))
    return word_map_diff

第七步,找出识别差异的图片,移动至 新文件夹:imgs_copy_word_diff

# Using the names recorded in ocr_diff.json, move the images whose OCR results disagree
# out of both result folders into ``imgs_copy_word_diff``.
def move_diffImg(diffJsonName='ocr_diff.json', path_ddddocr='imgs_copy_word',
                 path_baidu_ocr='imgs_copy_word_bdu', path_diff='imgs_copy_word_diff'):
    """Move the differing labelled images of both OCR runs into ``path_diff``.

    Each diff entry has the form ``glyph -> '<ddddocr result>, <baidu result>'``.
    The files are moved as-is (no image re-encoding). A file that cannot be
    moved is reported and skipped, so one missing image does not block the rest.

    Args:
        diffJsonName: JSON file holding the differences.
        path_ddddocr: Folder with the ddddocr-labelled images.
        path_baidu_ocr: Folder with the Baidu-labelled images.
        path_diff: Destination folder; created when missing.
    """
    with open(diffJsonName, 'r', encoding='utf-8') as f:
        diff_map = json.load(f)
    os.makedirs(path_diff, exist_ok=True)
    for n, v in diff_map.items():
        print(n, v)
        # Split on the exact ', ' separator first so that a result which is
        # itself a comma survives; fall back to a bare comma for hand-edited
        # entries.
        first, sep, second = v.partition(', ')
        if not sep:
            first, sep, second = v.partition(',')
        candidates = (
            (path_ddddocr, f'{n}__{first}.jpg'),
            (path_baidu_ocr, f'{n}__{second.strip()}.jpg'),
        )
        for folder, name in candidates:
            src = os.path.join(folder, name)
            try:
                shutil.move(src, os.path.join(path_diff, name))
            except OSError as e:
                # Typically the file is missing, e.g. it was already moved.
                print('skip', src, e)

结果如图:这部分需要人工查看挑选,修改对应的结果。

挑选过后,把这部分正确的文件,分别复制到 imgs_copy_word、imgs_copy_word_bdu 文件夹内。到此为止,图片打标签完成。

第八步,最后。重新执行一次  readImagName() 或者 readImagName(imagesPath='imgs_copy_word_bdu', saveJsonName='ocr_baidu.json')

重新得到 ocr_dddd.json 或者 ocr_baidu.json

这里面就是最终得到的字典映射。

使用:如图接口返回的为字体加密数据:

# Usage example: decode a font-obfuscated field of an API response with the finished mapping.
def _use():
    """Fetch the API response and print its obfuscated ``read_count`` in clear text.

    Each character is translated code point -> glyph name (via the font's cmap)
    -> real character (via ocr_dddd.json). A character that cannot be mapped is
    kept as-is, with surrounding whitespace stripped.
    """
    with open('ocr_dddd.json', 'r', encoding='utf-8') as f:
        words_map = json.load(f)
    # The target endpoint (the real link is intentionally not published).
    url = '链接就不放出来了。。。。。。'
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=30)
    content = response.content.decode('utf-8')
    font = TTFont('e26e946d8b2ccb7-500.woff2')
    cmap = font.getBestCmap()
    txt = json.loads(content)
    b = txt['data']['book_list'][0]['read_count']
    words = []
    for itm in b:
        try:
            glyph_name = cmap[ord(itm)]
            words.append(words_map[glyph_name])
        except (KeyError, TypeError):
            # KeyError: not an obfuscated character (absent from the cmap or the mapping).
            # TypeError: ``itm`` is not a single character.
            words.append(itm.strip())

    print(''.join(words))

打印结果

  • 3
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值