CnOCR文本抽取

小汪算法打怪升级之路

已于 2024-02-26 20:48:32 修改

阅读量65

点赞数

文章标签： python 开发语言

于 2023-03-07 09:27:32 首次发布

本文链接：https://blog.csdn.net/qq_50499245/article/details/129375533

版权

from cnocr import CnOcr
# import numpy as np
import fnmatch,os
# from PIL import Image
import re
from pdf2image import convert_from_path

# Single CnOcr engine built with default arguments when the module loads;
# img2txt reuses this one instance for every page image.
ocr = CnOcr()


def pdf2img(PDF_file):
    """Render every page of a PDF to ``./img/page_<n>.png`` (n starts at 1).

    Args:
        PDF_file: Path of the PDF to convert.

    Returns:
        A tuple ``(image_counter, PDF_file)``. ``image_counter`` is the number
        of pages written plus one (i.e. the next unused page index), and
        ``PDF_file`` is the input path, returned unchanged. For example a
        25-page ``H51001.pdf`` yields ``(26, <path to H51001.pdf>)``.
    """
    # The second positional argument of convert_from_path is the render DPI.
    pages = convert_from_path(PDF_file, 200)

    # Saving a page fails when the target directory is missing, so make sure
    # it exists before writing the first image.
    os.makedirs("./img", exist_ok=True)

    page_count = 0
    for page_count, page in enumerate(pages, start=1):
        page.save("./img/page_" + str(page_count) + ".png", 'png')

    # Callers expect "pages + 1" here (img2txt subtracts the 1 again).
    return page_count + 1, PDF_file

# Characters removed from every OCR'd page: newlines, ASCII spaces, full-width
# parentheses, ASCII letters, periods and the accented letters ü / é / à.
# Digits are NOT removed.
_OCR_NOISE = re.compile(r'[\n （）a-zA-Z.üéà]')


def _clean_page_text(text):
    """Return *text* with every character matched by ``_OCR_NOISE`` removed."""
    return _OCR_NOISE.sub('', text)


def img2txt(image_counter, PDF_file):
    """OCR the page images of one PDF and append the cleaned text to a file.

    Reads ``./img/page_1.png`` .. ``./img/page_<image_counter - 1>.png``,
    concatenates the ``"text"`` field of every OCR result for a page, strips
    noise characters, prints the page text and appends it to
    ``./OCR/<name>.txt``, where ``<name>.txt`` comes from
    ``newfilename(PDF_file)``.

    Args:
        image_counter: Number of pages plus one, as returned by ``pdf2img``.
        PDF_file: Path of the source PDF; only used to derive the output name.

    Returns:
        None. The output file is opened in append mode, so running the
        function again for the same PDF adds to the existing text.
    """
    page_total = image_counter - 1  # e.g. H51001: 26 - 1 = 25 pages

    out_dir = './OCR/'
    outfile = os.path.join(out_dir, newfilename(PDF_file))
    # open() does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    # UTF-8 is set explicitly so OCR output never hits a locale-dependent
    # encoding error; `with` closes the file even if OCR or a write raises.
    with open(outfile, "a", encoding="utf-8") as f:
        for i in range(1, page_total + 1):
            filename = "./img/page_" + str(i) + ".png"
            # Each OCR result is a mapping whose "text" entry holds the
            # recognised string; join them with no separator.
            text = "".join(ls["text"] for ls in ocr.ocr(filename))
            text = _clean_page_text(text)

            print(text)
            f.write(text)
    print(outfile + ' over\n')

def newfilename(filePath, outfile=''):
    """Derive the ``.txt`` output file name for a PDF path.

    Args:
        filePath: Path (or bare file name) of the source file.
        outfile: Unused. Kept only so callers that already pass it keep
            working; its value never affects the result.

    Returns:
        The base name of *filePath* with its ``.pdf`` extension (any letter
        case) replaced by ``.txt``, or ``''`` when the name does not end in
        ``.pdf``.
    """
    filename = os.path.basename(filePath)
    # Require the dot and ignore case: the previous '*PDF' pattern also
    # accepted names like 'reportPDF' (then chopped four characters off),
    # and mixed-case extensions were rejected on case-sensitive platforms.
    if filename[-4:].lower() == '.pdf':
        return filename[:-4] + '.txt'
    return ''


# Directory that holds the input PDFs.
filePath = './pdf/'
# Hard-coded work list, e.g. ['H51001.pdf', 'H51002.pdf', ...];
# use os.listdir(filePath) instead to process every file in the directory.
filelist = ['H21003.pdf']

# For each PDF: render its pages to images, then OCR those images to text.
for filename in filelist:
    image_counter, PDF_file = pdf2img(os.path.join(filePath, filename))
    img2txt(image_counter, PDF_file)


# cnocr包
# https://gitee.com/cyahua/cnocr?_from=gitee_search

# 要想使用pdf2image
# 要在Windows配置poppler
# https://stackoverflow.com/questions/18381713/how-to-install-poppler-on-windows