本文的方法主要实现批处理pdf2txt。强推方法二!!!
方法一:使用pdfminer3k
参考来自GitHub的代码。
######################################
# tesseract OCR
from PIL import Image
import pytesseract
def img_to_str_tesseract(image_path, lang='chi_sim'):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.
        lang: Language code handed to pytesseract; defaults to 'chi_sim'.

    Returns:
        The value returned by pytesseract.image_to_string for the image.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes rather than whenever the object is collected.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang)
######################################
# 百度 OCR
from aip import AipOcr
# Keyword arguments for the Baidu OCR client; all three values are empty
# strings here (presumably they must be filled in before the online OCR path
# can work — confirm against the Baidu AIP SDK).
config = dict(appId='', apiKey='', secretKey='')
# Module-level client; img_to_str_baidu calls client.basicGeneral on it.
client = AipOcr(**config)
def img_to_str_baidu(image_path):
    """Run Baidu OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file whose bytes are sent to the
            module-level client.

    Returns:
        The 'words' field of every entry in the response's 'words_result'
        list, joined with newlines; an empty string when the response has
        no 'words_result' key.
    """
    with open(image_path, 'rb') as fp:
        image = fp.read()
    result = client.basicGeneral(image)
    if 'words_result' not in result:
        return ""
    return '\n'.join(w['words'] for w in result['words_result'])
######################################
# 解析PDF文件
from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTChar, LTTextLine
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os
import sys
import numpy as np
import importlib
# Reload the sys module.
# NOTE(review): looks like a leftover of the Python 2
# reload(sys)/setdefaultencoding idiom — confirm whether it is still needed.
importlib.reload(sys)
# Directory that parse() recreates and into which parse_section() saves
# images extracted from the PDF.
TMPDIR = 'tmp/'
# When True, parse_section() also saves LTImage objects and runs OCR on them.
PARSEIMG = True
# When True, OCR goes through img_to_str_baidu; otherwise img_to_str_tesseract.
OCR_ONLINE = False
# 保存图片
def write_image(image, outdir):
    """Save an image object extracted from the PDF as a file in outdir.

    The file is named image.name plus an extension chosen from the stream's
    filters and the image's colour space.

    Args:
        image: Image layout object; its stream, colorspace, width, height
            and name attributes are read.
        outdir: Directory in which the file is created.

    Returns:
        A (path, size) tuple: the path of the written file and the number
        of bytes written.
    """
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        # A single DCT filter: the raw stream is written out as a .jpg.
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        # NOTE(review): create_bmp is neither defined nor imported in the
        # visible file, so this branch raises NameError unless it is provided
        # elsewhere — confirm where it is supposed to come from.
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        # NOTE(review): same create_bmp caveat as the RGB branch.
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        # Anything else is dumped as decoded stream data.
        ext = '.img'
        data = stream.get_data()
    path = os.path.join(outdir, image.name + ext)
    # The with-block closes the file even if the write fails.
    with open(path, 'wb') as fp:
        fp.write(data)
    return path, len(data)
# 写入文件
def write_file(path, text, ftype, debug=False):
    """Write text to a file.

    Args:
        path: Path of the file to write.
        text: Content passed to the file object's write().
        ftype: Mode string passed to open(), e.g. 'w' to overwrite or 'a'
            to append.
        debug: When true, print the length of text before writing it.
    """
    with open(path, ftype) as f:
        if debug:
            print("write", len(text))
        f.write(text)
# 去掉文中多余的回车
def adjust(inpath, outpath):
    """Remove redundant line breaks from a text file.

    A line at least as long as the median line length is joined with the
    line that follows it (its trailing newline is dropped). Separator lines
    consisting of '-----------' are removed. All other lines are kept as-is.

    Args:
        inpath: Path of the text file to read.
        outpath: Path handed to write_file for the cleaned-up text.
    """
    # Close the input promptly; the batch driver deletes it right after this
    # call.
    with open(inpath) as f:
        lines = f.readlines()
    if not lines:
        # np.median of an empty list yields nan with a warning; an empty
        # input simply produces an empty output.
        write_file(outpath, "", 'w')
        return
    length = np.median([len(line) for line in lines])  # median line length
    pieces = []
    for line in lines:
        # Test for the separator first: otherwise a separator that is not
        # shorter than the median would be merged into the text instead of
        # being dropped.
        if line == '-----------\n':
            continue
        if len(line) >= length and line.endswith('\n'):
            pieces.append(line[:-1])  # drop the newline at the end of the line
        else:
            pieces.append(line)
    write_file(outpath, "".join(pieces), 'w')
# 解析每个数据块
def parse_section(layout, outpath, debug = False):
    """Append the text of every element in a layout container to outpath.

    Horizontal text boxes are appended directly, figures are walked
    recursively, and, when PARSEIMG is set, images are saved into TMPDIR,
    run through OCR and appended followed by a '-----------' separator line.

    Args:
        layout: Iterable of layout objects (a page layout or an LTFigure).
        outpath: Path of the text file that receives the output.
        debug: Forwarded to write_file and to recursive calls.
    """
    for x in layout:
        if isinstance(x, LTTextBoxHorizontal):  # text
            write_file(outpath, x.get_text(), 'a', debug)
        elif isinstance(x, LTFigure):
            # A figure is itself a container of layout objects.
            parse_section(x, outpath, debug)
        elif isinstance(x, LTImage) and PARSEIMG:  # image
            path, length = write_image(x, TMPDIR)
            if length > 0:
                if OCR_ONLINE:
                    write_file(outpath, img_to_str_baidu(path), 'a', debug)
                else:
                    write_file(outpath, img_to_str_tesseract(path), 'a', debug)
                # Separator line; adjust() looks for this exact line.
                write_file(outpath, '\n' + '-----------' + '\n', 'a', debug)
# 删除文件
def remove(path):
    """Delete a file or a whole directory tree; do nothing if path is absent.

    Args:
        path: File or directory to delete.

    Raises:
        OSError: If the underlying delete operation fails.
    """
    import shutil

    # lexists (unlike exists) is also true for a dangling symlink, so such
    # links get deleted instead of being silently skipped.
    if not os.path.lexists(path):
        return
    if os.path.isdir(path) and not os.path.islink(path):
        # rmtree does not descend into symlinked directories, so nothing
        # outside the tree rooted at path is touched.
        shutil.rmtree(path)
    else:
        # Regular files and symlinks (the link itself, not its target).
        os.remove(path)
# 解析PDF文件
def parse(inpath, outpath):
    """Extract the text of a PDF file into outpath.

    TMPDIR is recreated empty and any existing outpath is deleted first.
    Every page is then laid out and handed to parse_section, which appends
    to outpath.

    Args:
        inpath: Path of the PDF file to read.
        outpath: Path of the text file to produce.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    remove(TMPDIR)  # clear the temporary directory
    os.mkdir(TMPDIR)
    remove(outpath)  # parse_section appends, so start without an old output
    # Keep the PDF open only while it is being parsed; the with-block closes
    # it on every exit path, including the raise below.
    with open(inpath, 'rb') as fp:
        parser = PDFParser(fp)  # PDF parser over the open file
        # NOTE(review): the parser is passed to the PDFDocument constructor
        # and then attached again via set_parser — confirm the constructor
        # signature of the installed pdfminer3k.
        doc = PDFDocument(parser)
        parser.set_document(doc)  # link the parser and the document object
        doc.set_parser(parser)
        doc.initialize("")  # presumably an empty password — confirm
        if not doc.is_extractable:  # the document does not allow text export
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()  # PDF resource manager
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)  # PDF interpreter
        for idx, page in enumerate(doc.get_pages()):  # iterate over the pages
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)
if __name__ == '__main__':
    # Batch mode: convert every PDF found under the directory given as the
    # first command-line argument; each .txt is written next to its .pdf.
    if len(sys.argv) < 2:
        sys.exit("usage: python <script> <directory containing pdf files>")
    target_path = sys.argv[1]
    for base_path, folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            if not file_name.lower().endswith('.pdf'):
                # not a PDF file
                continue
            # os.walk yields bare file names; join with base_path so files
            # outside the current working directory are found.
            pdffile = os.path.join(base_path, file_name)
            # Swap only the extension; str.replace('pdf', ...) would also
            # rewrite any other "pdf" occurring in the path.
            stem = os.path.splitext(pdffile)[0]
            tmpfile = stem + '.tmp'
            txtfile = stem + '.txt'
            try:
                parse(pdffile, tmpfile)
                adjust(tmpfile, txtfile)
                remove(tmpfile)  # delete the intermediate .tmp file
            except Exception as e:
                # Best effort: report the failure, including its cause, and
                # carry on with the next file.
                print(file_name, " error!", e)
笔者根据自己的需要进行过调整。
!坑
一定不要安装pdfminer,是pdfminer3k啊大兄弟。如果不幸安装了,可以先uninstall pdfminer和pdfminer3k,然后再install pdfminer3k,根据提示把所有已经存在的包删掉,最后重新install pdfminer3k。
跪求大神解答为何pdfminer和pdfminer3k会导包混乱
方法二:借助xpdf
参考自知乎,根据自己的需要和pdfminer3k代码进行优化:
import numpy as np
import os
import subprocess
from os.path import isfile,join
# Path of the xpdf pdftotext executable that convert() runs.
ef = r'./xpdf/pdftotext.exe'
# Path of the xpdf configuration file that convert() passes after -cfg.
cfg = r'./xpdf/xpdfrc'
def convert(file_name_pdf, first_page=1, last_page=1000):
    """Extract the text of a PDF under ./resourses with xpdf's pdftotext.

    Args:
        file_name_pdf: Path of the PDF relative to the ./resourses directory.
        first_page: Value passed to pdftotext's -f option; defaults to 1.
        last_page: Value passed to pdftotext's -l option; defaults to 1000.

    Returns:
        The captured output of pdftotext decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If pdftotext exits with a non-zero
            status.
        UnicodeDecodeError: If the output is not valid UTF-8.
    """
    file_name_pdf = join(r'./resourses', file_name_pdf)
    # NOTE: the original author reports that every file argument of this
    # command must be a full path, otherwise the call fails.
    # The trailing '-' is the output name; check_output captures what
    # pdftotext prints.
    bo = subprocess.check_output([
        ef,
        '-f', str(first_page),
        '-l', str(last_page),
        '-cfg', cfg,
        '-raw',
        file_name_pdf,
        '-',
    ])
    return bo.decode('utf-8')
def write_file(bo,file_name,method="wb"):
    """Write data to a file inside the ./results/ directory.

    Args:
        bo: Content to write; bytes for the default binary mode.
        file_name: Name of the file relative to ./results/.
        method: Mode string passed to open(); defaults to "wb".
    """
    file_name = join(r'./results/', file_name)
    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, method) as f:
        f.write(bo)
# 去除换行
def adjust(inpath, outpath):
    """Remove redundant line breaks from a UTF-8 text file in ./results/.

    A line at least as long as the median line length is joined with the
    line that follows it (its trailing newline is dropped). Separator lines
    consisting of '-----------' are removed. All other lines are kept as-is.
    The result is encoded as UTF-8 and handed to write_file.

    Args:
        inpath: Name of the input file relative to ./results/.
        outpath: Output file name passed on to write_file.
    """
    inpath = join(r'./results/', inpath)
    # Close the input promptly; the batch driver deletes it right after this
    # call, which fails on some platforms while the file is still open.
    with open(inpath, encoding='utf-8') as f:
        lines = f.readlines()
    if not lines:
        # np.median of an empty list yields nan with a warning; an empty
        # input simply produces an empty output.
        write_file(b"", outpath)
        return
    length = np.median([len(line) for line in lines])  # median line length
    pieces = []
    for line in lines:
        # Test for the separator first: otherwise a separator that is not
        # shorter than the median would be merged into the text instead of
        # being dropped.
        if line == '-----------\n':
            continue
        if len(line) >= length and line.endswith('\n'):
            pieces.append(line[:-1])  # drop the newline at the end of the line
        else:
            pieces.append(line)
    write_file("".join(pieces).encode('utf-8'), outpath)
def rm(inpath):
    """Delete a file from the ./results/ directory.

    Args:
        inpath: Name of the file relative to ./results/.

    Raises:
        OSError: If os.remove fails, e.g. because the file does not exist.
    """
    os.remove(join(r'./results/', inpath))
if __name__ == '__main__':
    # Batch mode: convert every PDF found under ./resourses into a .txt file
    # in ./results/ and report how many conversions succeeded and failed.
    su_count = 0
    er_count = 0
    count = 0
    target_path = r'./resourses'
    for base_path, folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            if not file_name.lower().endswith('.pdf'):
                # not a PDF file
                continue
            count += 1
            # convert() resolves its argument against ./resourses, so pass
            # the path relative to that root; a bare file name would miss
            # PDFs that live in sub-directories.
            pdffile = os.path.relpath(join(base_path, file_name), target_path)
            # Swap only the extension; str.replace('pdf', ...) would also
            # rewrite any other "pdf" occurring in the name. Outputs are
            # written flat into ./results/, so equal names from different
            # sub-directories overwrite each other.
            stem = os.path.splitext(file_name)[0]
            tmpfile = stem + '.tmp'
            txtfile = stem + '.txt'
            try:
                bo = convert(pdffile).encode('utf-8')
                write_file(bo, tmpfile)
                adjust(tmpfile, txtfile)
                rm(tmpfile)
            except Exception as e:
                # Best effort: count and report the failure, including its
                # cause, then carry on with the next file.
                er_count += 1
                print(count, "-->", file_name, " error!\n ", e)
            else:
                su_count += 1
                print(count, "-->", file_name, " success!\n ")
    print("\ncount: ", count, "\n", "success: ", su_count, "\n", "error: ", er_count)
感谢乐于在网络上分享的大神们,终于不用一篇一篇转了!
链接在这:
链接:https://pan.baidu.com/s/1QW3XMAvf8qJlaLHxmUBEXg
提取码:tw95
使用的小伙伴记得提前看README.md,我踩过的坑你们一定不能再踩了,挥泪~~