# CnOCR text extraction (CnOCR文本抽取)

from cnocr import CnOcr
# import numpy as np
import fnmatch,os
# from PIL import Image
import re
from pdf2image import convert_from_path

ocr = CnOcr()


def pdf2img(PDF_file):
    """Render every page of a PDF to ``./img/page_<n>.png``.

    The pages are produced by ``convert_from_path(PDF_file, 200)`` (200 is
    passed as the second argument, which pdf2image documents as the DPI) and
    saved as PNG files numbered from 1.

    Args:
        PDF_file: Path of the PDF to convert.

    Returns:
        A ``(image_counter, PDF_file)`` tuple. ``image_counter`` is the number
        of pages written plus one (e.g. 26 for a 25-page PDF); ``PDF_file`` is
        the input path, returned unchanged.
    """
    # page.save() does not create missing directories, so make sure the
    # output directory exists before writing the first page.
    os.makedirs("./img", exist_ok=True)

    pages = convert_from_path(PDF_file, 200)

    last_page = 0
    for last_page, page in enumerate(pages, start=1):
        page.save(f"./img/page_{last_page}.png", 'png')

    # Callers expect "next free page number", i.e. page count + 1.
    return last_page + 1, PDF_file
#  return 26,H51001.pdf

# Characters stripped from the OCR output: newlines, spaces, ASCII parentheses,
# ASCII letters, full stops and the accented letters ü / é / à.
# Digits are NOT removed.
_OCR_NOISE = re.compile(r'[\n ()a-zA-Z.üéà]')


def _clean_ocr_text(text):
    """Return *text* with every character matched by ``_OCR_NOISE`` removed."""
    return _OCR_NOISE.sub('', text)


def img2txt(image_counter, PDF_file):
    """OCR the page images in ``./img/`` and append the text to a file.

    Reads ``./img/page_1.png`` .. ``./img/page_<image_counter - 1>.png``,
    concatenates the ``"text"`` field of every item returned by
    ``ocr.ocr(...)`` for each page, strips unwanted characters, prints the
    cleaned page text and appends it to ``./OCR/<name>.txt``, where
    ``<name>.txt`` comes from ``newfilename(PDF_file)``.

    Args:
        image_counter: Page count plus one (e.g. 26 for a 25-page PDF).
        PDF_file: Path of the source PDF; only used to derive the output name.
    """
    page_count = image_counter - 1  # e.g. 26 - 1 = 25 pages

    out_dir = './OCR/'
    # open() does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    outfile = os.path.join(out_dir, newfilename(PDF_file))

    # Context manager so the file is closed even if OCR raises part-way;
    # explicit UTF-8 so the Chinese text does not depend on the platform's
    # default encoding.
    with open(outfile, "a", encoding="utf-8") as f:
        for i in range(1, page_count + 1):
            filename = f"./img/page_{i}.png"
            text = "".join(ls["text"] for ls in ocr.ocr(filename))
            text = _clean_ocr_text(text)

            print(text)
            f.write(text)
    print(outfile + ' over\n')

def newfilename(filePath, outfile=''):
    """Derive the ``.txt`` output file name for a PDF path.

    Args:
        filePath: Path to a file; only its final component is used.
        outfile: Ignored. Kept only so existing calls that pass it still work.

    Returns:
        The base name with its ``.pdf`` extension (matched case-insensitively)
        replaced by ``.txt``, or ``''`` when the name does not end in ``.pdf``.
    """
    filename = os.path.basename(filePath)
    # Require the dot and compare case-insensitively so the result is the same
    # on every platform and names such as "reportPDF" are not mistaken for PDFs.
    if filename.lower().endswith('.pdf'):
        return filename[:-4] + '.txt'
    return ''


# Directory that holds the input PDFs.
filePath = './pdf/'
# To process every file in the directory instead:
# filelist = os.listdir(filePath)
filelist = ['H21003.pdf']
# e.g. ['H51001.pdf', 'H51002.pdf', ...]

# For each listed PDF: render its pages to images, then OCR them to text.
for filename in filelist:
    image_counter, PDF_file = pdf2img(os.path.join(filePath, filename))
    img2txt(image_counter, PDF_file)


# cnocr package:
# https://gitee.com/cyahua/cnocr?_from=gitee_search

# pdf2image depends on poppler; on Windows poppler must be installed and
# configured before pdf2image can be used:
# https://stackoverflow.com/questions/18381713/how-to-install-poppler-on-windows

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值