第一步,找到目标网站的font链接,下载对应的 woff2文件。
第二步,使用 “High-Logic FontCreator” 工具打开(下载链接:Download font software (high-logic.com))
文件打开是这样的:显示有 363个文字。
第三步,切割单个文字,代码如下:
# -*- coding: utf-8 -*-
import base64
import json
import os
import time

import matplotlib.pyplot as plt
import requests
from fontTools.pens.freetypePen import FreeTypePen
from fontTools.ttLib import TTFont
# Split the woff2 font into one image per glyph, saved under imgs/.
def font_split_single_img(font_path='e26e946d8b2ccb7.woff2', out_dir='imgs'):
    """Render every glyph of *font_path* to a separate JPEG.

    Each codepoint in the font's best cmap is traced with FreeTypePen,
    rasterized, and saved as '<out_dir>/<glyph_name>.jpg'.

    Args:
        font_path: path to the woff2 font file (generalized from the
            previously hard-coded name; default keeps old behavior).
        out_dir: destination folder for the glyph images.
    """
    font = TTFont(font_path)            # parse the woff2 file
    cmap = font.getBestCmap()           # codepoint -> glyph name
    # font.saveXML('font.xml')          # optionally dump the font as XML
    glyph_set = font.getGlyphSet()      # hoisted: invariant across the loop
    os.makedirs(out_dir, exist_ok=True)  # hoisted: was recreated per glyph
    total = len(cmap)
    for index, (codepoint, glyph_name) in enumerate(cmap.items(), start=1):
        glyph = glyph_set[glyph_name]   # select the glyph object by name
        pen = FreeTypePen(None)         # pen that rasterizes via FreeType
        glyph.draw(pen)                 # trace the glyph outline
        # pen.show()                    # optional interactive preview
        bitmap = pen.array()            # rasterized glyph as an array
        print(index, '/', total, '~~~', glyph)
        plt.figure()
        plt.imshow(bitmap)
        plt.axis('off')                 # no axes in the saved image
        plt.savefig('./{0}/{1}.jpg'.format(out_dir, glyph_name))
        # plt.show()                    # optional interactive preview
        plt.clf()
        plt.cla()
        plt.close()                     # release the figure to avoid leaks
切割后的结果,注意命名:
第四步,图片识别字符(ddddocr + 百度ocr)
1、pip install ddddocr 安装
import ddddocr
from PIL import Image
# Recognize each glyph image with ddddocr; annotated copies go to imgs_copy_word/.
def ocrWords():
    """OCR every image under 'imgs' with ddddocr.

    Each image is re-saved to 'imgs_copy_word' named
    '<glyph>__<recognized_char>.jpg' so results can be reviewed by eye.

    Returns:
        dict mapping glyph name -> recognized character (previously the
        mapping was computed but never returned — bug fix).
    """
    ocr = ddddocr.DdddOcr(beta=False, show_ad=False)  # recognizer instance
    word_map = {}
    os.makedirs('imgs_copy_word', exist_ok=True)  # hoisted out of the loop
    for parent, dirnames, filenames in os.walk('imgs'):  # every glyph image
        for filename in filenames:
            k = filename.split('.')[0]  # glyph name without extension
            currentPath = os.path.join(parent, filename)
            with open(currentPath, 'rb') as f:
                image = f.read()
            res = ocr.classification(image)
            if len(res) == 0:
                res = '未找到'  # marker: nothing recognized
            if len(res) > 1:
                res = res[0]   # each glyph is one character; keep the first
            print(k, 'res:', res)
            d = f'{k}__{res}.jpg'
            img = Image.open(currentPath)
            img.save('imgs_copy_word/%s' % d)
            word_map[k] = res
    # BUG FIX: the mapping was built but never returned
    return word_map
识别效果如下:
2、百度OCR识别(链接:网络图片识别_图片文字识别_图片转文字-百度AI开放平台)需要账号登录。
代码如下,注意,需要添加登录后的cookie:
# Recognize each glyph image via Baidu's web-OCR demo endpoint;
# annotated copies go to imgs_copy_word_bdu/.
def ocrWords_baidu_ocr():
    """OCR every image under 'imgs' through Baidu's demo OCR endpoint.

    Requires a logged-in cookie in *headers*. Each image is re-saved to
    'imgs_copy_word_bdu' as '<glyph>__<text>.jpg' for manual review.

    Returns:
        dict mapping glyph name -> recognized text (previously the
        mapping was computed but never returned — bug fix).
    """
    url = 'https://ai.baidu.com/aidemo'
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "你登录后的cookie",
        "Host": "ai.baidu.com",
        "Referer": "https://ai.baidu.com/tech/ocr_others/webimage?_=1694671106471",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    }
    word_map = {}
    index = 0
    os.makedirs('imgs_copy_word_bdu', exist_ok=True)  # hoisted out of the loop
    for parent, dirnames, filenames in os.walk('imgs'):  # every glyph image
        for filename in filenames:
            index += 1
            # if index <= 332:   # resume helper after an interruption
            #     continue
            k = filename.split('.')[0]  # glyph name without extension
            currentPath = os.path.join(parent, filename)
            with open(currentPath, 'rb') as f:
                image = f.read()
            bs64Img = base64.b64encode(image)
            bs64Img = 'data:image/jpeg;base64,%s' % bs64Img.decode()
            postData = {
                "image": bs64Img,
                "image_url": "",
                "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage"
            }
            response = requests.post(url=url, headers=headers, data=postData)
            content = json.loads(response.text.strip())
            if content['msg'] == 'success':
                if int(content['data']['words_result_num']) == 0:
                    res = '未找到Result0'  # marker: OCR found no words
                else:
                    res = content['data']['words_result'][0]['words']
            else:
                res = '未找到'  # marker: the request itself failed
            print(k, 'res:', res, index, len(filenames))
            d = f'{k}__{res}.jpg'
            img = Image.open(currentPath)
            img.save('imgs_copy_word_bdu/%s' % d)
            word_map[k] = res
            time.sleep(0.4)  # throttle: avoid hammering the demo endpoint
    # BUG FIX: the mapping was built but never returned
    return word_map
识别后的结果如图:
第五步, 合并比较前两步识别的结果,imgs_copy_word、imgs_copy_word_bdu,代码如下:
根据识别后的名称,提取结果,并保存为 .json文件:ddddocr识别的保存为:ocr_dddd.json,百度ocr识别的,保存为:ocr_baidu.json
# Rebuild the glyph->char mapping from the annotated file names and save it
# as JSON: ddddocr results -> ocr_dddd.json, baidu results -> ocr_baidu.json.
def readImagName(imagesPath='imgs_copy_word', saveJsonName='ocr_dddd.json'):
    """Extract {glyph_name: char} from names like '<glyph>__<char>.jpg'.

    Walks *imagesPath*, parses every file name, and writes the mapping to
    *saveJsonName* as UTF-8 JSON (only when at least one file matched).

    Returns:
        dict mapping glyph name -> recognized character.
    """
    word_map = {}
    for parent, dirnames, filenames in os.walk(imagesPath):  # every image
        for filename in filenames:
            stem = filename.split('.')[0]  # drop the extension
            parts = stem.split('__')
            if len(parts) < 2:
                # BUG FIX: a stray file without '__' used to raise IndexError
                continue
            word_map[parts[0]] = parts[1]
    if word_map:
        with open(saveJsonName, 'w', encoding='utf-8') as f:
            f.write(json.dumps(word_map, ensure_ascii=False))
    return word_map
文件结果如下部分:
{
"gid58344": "体",
"gid58345": "y",
"gid58346": "十",
"gid58347": "现",
"gid58348": "快",
"gid58349": "便",
"gid58350": "话",
"gid58351": "却",
"gid58352": "月",
"gid58353": "物",
"gid58354": "水",
"gid58355": "的",
"gid58356": "放",
"gid58357": "知",
"gid58358": "爱",
"gid58359": "万",
......
......
......
......
}
第六步,对比差异,代码如下:结果保存为 ocr_diff.json
# Compare the two OCR result files and record disagreements in ocr_diff.json.
def jsonDiff(path_ddddocr='ocr_dddd.json', path_baidu_ocr='ocr_baidu.json'):
    """Diff the ddddocr and baidu OCR mappings.

    A glyph agrees when both results are equal, or equal after
    upper-casing the ddddocr result (baidu tends to return upper case).
    Disagreements are written to 'ocr_diff.json' as
    {glyph: '<ddddocr>, <baidu>'} when any exist.

    Returns:
        dict of disagreements (may be empty).
    """
    with open(path_ddddocr, 'r', encoding='utf-8') as f:
        json_ddddocr = json.loads(f.read())
    with open(path_baidu_ocr, 'r', encoding='utf-8') as f:
        json_bduocr = json.loads(f.read())
    word_map_diff = {}
    for n, v in json_ddddocr.items():
        # BUG FIX: direct indexing raised KeyError when a glyph was
        # missing from the baidu results; .get records it as a diff
        v2 = json_bduocr.get(n)
        if v == v2:
            print(n, v)
        elif v.upper() == v2:
            print(n, v)  # case-insensitive match counts as agreement
        else:
            print('不一致')
            word_map_diff[n] = f'{v}, {v2}'
    # persist the disagreements for the manual-review step
    if word_map_diff:
        with open('ocr_diff.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(word_map_diff, ensure_ascii=False))
    return word_map_diff
第七步,找出识别差异的图片,移动至 新文件夹:imgs_copy_word_diff
# Using ocr_diff.json, copy the disagreement images into imgs_copy_word_diff/
# for manual review, then delete the originals from both result folders.
def move_diffImg():
    """Move the disagreement images listed in 'ocr_diff.json'.

    For every glyph with conflicting OCR results, both annotated copies
    (ddddocr and baidu) are saved into 'imgs_copy_word_diff' and removed
    from their source folders. Missing/unreadable files are skipped and
    reported instead of crashing (best-effort semantics).
    """
    with open('ocr_diff.json', 'r', encoding='utf-8') as f:
        r = json.loads(f.read())
    os.makedirs('imgs_copy_word_diff', exist_ok=True)  # hoisted out of the loop
    for n, v in r.items():
        print(n, v)
        vs = v.split(',')  # value format: '<ddddocr>, <baidu>'
        n1 = f'{n}__{vs[0]}.jpg'          # file name in the ddddocr folder
        n2 = f'{n}__{vs[1].strip()}.jpg'  # file name in the baidu folder
        currentPath1 = f'imgs_copy_word/{n1}'
        currentPath2 = f'imgs_copy_word_bdu/{n2}'
        try:
            img = Image.open(currentPath1)
            img.save('imgs_copy_word_diff/%s' % n1)
            img = Image.open(currentPath2)
            img.save('imgs_copy_word_diff/%s' % n2)
            os.remove(currentPath1)  # delete the original diff file
            os.remove(currentPath2)  # delete the original diff file
        except OSError as e:
            # BUG FIX: was a bare 'except: pass' that hid every error;
            # keep best-effort behavior but report what was skipped
            print('skip', n, e)
结果如图:这部分需要人工查看挑选,修改对应的结果。
挑选过后,把这部分正确的文件,分别复制到 imgs_copy_word、imgs_copy_word_bdu 文件夹内。到此为止,图片打标签完成。
第八步,最后。重新执行一次 readImagName() 或者 readImagName(imagesPath='imgs_copy_word_bdu', saveJsonName='ocr_baidu.json')
重新得到 ocr_dddd.json 或者 ocr_baidu.json
这里面就是最终得到的字典映射。
使用:如图接口返回的为字体加密数据:
# Usage demo: decode an obfuscated-font field from the API response.
def _use():
    """Decode the font-obfuscated 'read_count' field of the API response.

    Loads the glyph->char mapping produced earlier (ocr_dddd.json) and the
    woff2 font that this response uses, then translates each character of
    the field: codepoint -> glyph name (via cmap) -> real character (via
    the mapping). Characters without a mapping are kept as-is.
    """
    with open('ocr_dddd.json', 'r', encoding='utf-8') as f:
        words_map = json.loads(f.read())
    # the target API endpoint (redacted in the original write-up)
    url = '链接就不放出来了。。。。。。'
    response = requests.get(url)
    content = response.content.decode('utf-8')
    font = TTFont('e26e946d8b2ccb7-500.woff2')  # font served with this response
    cmap = font.getBestCmap()  # codepoint -> glyph name
    txt = json.loads(content)
    b = txt['data']['book_list'][0]['read_count']
    words = []
    for itm in b:
        try:
            glyph_name = cmap[ord(itm)]
            words.append(words_map[glyph_name])
        except KeyError:
            # BUG FIX: was a bare 'except' (hid real bugs) with a dead
            # 'word = -1' assignment; only unmapped characters (plain
            # digits/punctuation) fall through, kept verbatim
            words.append(itm.strip())
    print(''.join(words))
打印结果: