import sys, fitz
import os
import datetime
def pyMuPDF_fitz(pdfPath, imagePath):
    """Render every page of a PDF to a PNG file inside ``imagePath``.

    Pages are written as ``images_<index>.png`` where ``<index>`` is the
    zero-based page number. The output folder is created if it is missing.
    The target folder and the elapsed time (whole seconds) are printed.

    Args:
        pdfPath: Path of the PDF file to open.
        imagePath: Folder that receives the rendered page images.
    """
    startTime_pdf2img = datetime.datetime.now()  # start time
    print("imagePath=" + imagePath)

    # Zoom factor applied to each axis. Per the original author's note, the
    # default render is 792x612 at 72 dpi; 1.33333333 yields 1056x816 and a
    # factor of 2 would yield 1584x1224.
    zoom_x = 1.33333333
    zoom_y = 1.33333333
    rotate = 0

    # NOTE(review): pageCount / getPixmap / preRotate / writePNG are the legacy
    # camelCase PyMuPDF names; newer releases appear to expose snake_case
    # equivalents instead - confirm against the installed PyMuPDF version.
    pdfDoc = fitz.open(pdfPath)
    try:
        # Create the folder once instead of re-checking it for every page;
        # exist_ok avoids the check-then-create race.
        os.makedirs(imagePath, exist_ok=True)

        # The transform is identical for every page, so build it only once.
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

        for pg in range(pdfDoc.pageCount):
            pix = pdfDoc[pg].getPixmap(matrix=mat, alpha=False)
            pix.writePNG(os.path.join(imagePath, 'images_%s.png' % pg))
    finally:
        # Release the document even when rendering or writing fails.
        pdfDoc.close()

    endTime_pdf2img = datetime.datetime.now()  # end time
    print('pdf2img时间=', (endTime_pdf2img - startTime_pdf2img).seconds)
if __name__ == "__main__":
    # Script entry point: '2.pdf' is the PDF to convert, 'book' is the folder
    # that receives the page images rendered from it.
    pdfPath, imagePath = '2.pdf', 'book'
    pyMuPDF_fitz(pdfPath, imagePath)
def _first_inked_row_is_wide(image_arr, min_pixels=300):
    """Classify an H x W x 3 pixel array by its first row holding non-white pixels.

    Returns True when that row has at least ``min_pixels`` non-white pixels,
    False when it has fewer, and None when no row has any.
    """
    # A pixel is non-white when any of its channels differs from 255.
    non_white_per_row = (image_arr != 255).any(axis=2).sum(axis=1)
    inked_rows = np.flatnonzero(non_white_per_row)
    if inked_rows.size == 0:
        return None
    return bool(non_white_per_row[inked_rows[0]] >= min_pixels)


def _next_page_path(path):
    """Return ``path`` with the number that ends its file stem increased by one.

    The whole trailing digit run is incremented (zero padding is kept), so
    ``images_19.png`` becomes ``images_20.png``.

    Raises:
        ValueError: If the file stem does not end in a digit.
    """
    stem, ext = os.path.splitext(path)
    prefix = stem.rstrip('0123456789')
    digits = stem[len(prefix):]
    if not digits:
        raise ValueError('no page number found before the extension: %r' % (path,))
    return prefix + str(int(digits) + 1).zfill(len(digits)) + ext


def to_page(file_img, min_pixels=300):
    """Check whether the first non-white pixel row of an image is "wide".

    Rows are scanned from the top. The first row that contains any non-white
    pixel decides the result.
    NOTE(review): presumably this detects a long ruled line (e.g. a table
    border) at the top of a page - confirm with the caller's intent.

    Relies on ``Image`` and ``np`` being available at module level
    (presumably ``PIL.Image`` and ``numpy`` - confirm the imports).

    Args:
        file_img: Path (or file object) of the image to inspect.
        min_pixels: Minimum number of non-white pixels for the row to count
            as wide.

    Returns:
        True if that row has at least ``min_pixels`` non-white pixels, False
        if it has fewer, and None if the image contains no non-white pixel.
    """
    # Convert to RGB so "white" means (255, 255, 255) for every image mode;
    # the context manager closes the underlying file handle.
    with Image.open(file_img) as image:
        image_arr = np.array(image.convert('RGB'))
    return _first_inked_row_is_wide(image_arr, min_pixels)


def page(path, y_offset=1100):
    """Run OCR on the page image that follows ``path`` and shift its boxes down.

    The page number at the end of the file stem is incremented to get the
    next image. OCR is only run when ``to_page`` reports a truthy result for
    that image.

    Relies on an ``OCR`` callable being available at module level.
    NOTE(review): each result's first element is indexed as four points whose
    index 1 is shifted - presumably the y coordinate of a quadrilateral box;
    confirm against the OCR backend.

    Args:
        path: Path of the current page image, ending in ``<number><ext>``.
        y_offset: Amount added to index 1 of each of the four box points.

    Returns:
        The OCR results with shifted boxes (modified in place), or None when
        ``to_page`` is falsy for the next image.

    Raises:
        ValueError: If ``path`` has no page number before its extension.
    """
    next_path = _next_page_path(path)
    if not to_page(next_path):
        return None

    results = OCR(next_path)
    for result in results:
        box = result[0]
        for corner in range(4):
            box[corner][1] += y_offset
    return results