使用情景
当我们爬取小说的时候,往往会遇到字体加密(字体反爬),比如番茄小说就使用了 woff2 字体。用常规方法处理比较棘手,所以这里采用图像识别(OCR)的方式来还原字体映射,这也得感谢开源库 ddddocr。
咱也废话少说,直接看代码
代码
import base64
import json
import os
from io import BytesIO

import ddddocr
from fontTools.ttLib import TTFont
from fontTools.ttLib.woff2 import decompress
from PIL import Image, ImageDraw, ImageFont


class decode_func:
    """Map the code points of an obfuscated font to OCR-recognised characters.

    Every code point in the font's cmap is rendered to a bitmap and passed
    to ddddocr; the recognised text is collected in ``my_dict``. OCR output
    is not guaranteed to be correct, so the mapping may need manual review.

    Attributes:
        filename: Path of the TTF file used for rendering and cmap lookup.
        my_dict: Mapping of integer code point -> recognised text.
        ocr: The ``ddddocr.DdddOcr`` instance used for recognition.
        font: ``TTFont`` opened from ``filename``.
    """

    def __init__(self, filename):
        """Prepare a TTF version of *filename* and set up OCR.

        Args:
            filename: Path to a ``.ttf``, ``.woff`` or ``.woff2`` font.
                WOFF/WOFF2 input is converted to a ``.ttf`` file written
                next to the input (same path, extension replaced).

        Raises:
            ValueError: If the extension is not ttf, woff or woff2.
        """
        # splitext only strips the final extension, so directory parts and
        # extra dots in the name (e.g. "./fonts/a.b.woff2") are preserved.
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()
        ttf_file = stem + '.ttf'

        if ext == '.woff2':
            decompress(filename, ttf_file)
            self.filename = ttf_file
        elif ext == '.ttf':
            self.filename = filename
        elif ext == '.woff':
            woff_font = TTFont(filename, recalcBBoxes=False, recalcTimestamp=False)
            # A font loaded from WOFF keeps flavor == 'woff'; clear it so
            # save() writes a plain (uncompressed) TTF.
            woff_font.flavor = None
            woff_font.save(ttf_file)
            woff_font.close()
            self.filename = ttf_file
        else:
            # Message text means "wrong file format".
            raise ValueError('文件格式错误')

        self.my_dict = {}
        self.ocr = ddddocr.DdddOcr()
        self.font = TTFont(self.filename)

    def font_to_img(self, txt):
        """Render one code point, centred, on a 1024x1024 1-bit image.

        Args:
            txt: Integer code point of the character to draw.

        Returns:
            A ``PIL.Image.Image`` in mode ``'1'`` with a white background
            and the glyph drawn in black.
        """
        img_size = 1024
        img = Image.new('1', (img_size, img_size), 255)
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(self.filename, int(img_size * 0.7))
        txt = chr(txt)
        left, top, right, bottom = draw.textbbox((0, 0), txt, font=font)
        # The bbox does not start at the drawing origin, so subtract its
        # left/top offset; otherwise the glyph ends up off-centre.
        x = (img_size - (right - left)) // 2 - left
        y = (img_size - (bottom - top)) // 2 - top
        draw.text((x, y), txt, font=font, fill=0)
        return img

    def font_analysis(self):
        """Run OCR on every mapped code point and fill ``self.my_dict``.

        Takes no arguments; the font and OCR engine created in ``__init__``
        are used. Existing entries in ``my_dict`` are overwritten per key.
        """
        # getBestCmap() returns None when the font has no Unicode cmap.
        cmap = self.font.getBestCmap() or {}
        for code_point in cmap:
            glyph_img = self.font_to_img(code_point)
            buffer = BytesIO()
            glyph_img.save(buffer, format="PNG")
            self.my_dict[code_point] = self.ocr.classification(buffer.getvalue())

    def get_dict(self):
        """Build the mapping and write it to ``my_dict.json``.

        The file is written to the current working directory as UTF-8
        JSON; integer code points become string keys in the JSON output.

        Returns:
            The ``my_dict`` mapping that was written.
        """
        self.font_analysis()
        with open('my_dict.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.my_dict, ensure_ascii=False))
        return self.my_dict


if __name__ == '__main__':
    # Put the obfuscated font file name here; woff, woff2 and ttf all work.
    my_p = decode_func('y6.ttf')
    my_p.get_dict()
这段代码的思路是借助图像转换:先把字体里的每个字符渲染成图片,再用 ddddocr 做图像识别,从而得到"字符编码 → 真实文字"的对应关系,主要用来解决 woff2 这类加密字体只有字形、没有真实文字对应的问题,最后把结果以 JSON 文件导出。不过识别率肯定不是 100%,可能需要自己手动微调,比如"几"会多带一个"l",0 会被识别为 o 等等。
运行成功后,会在当前目录下得到 my_dict.json 文件,里面保存的就是字符编码与识别出的文字之间的映射。
有用的话给个赞加关注支持一下呗