公众号:爬虫与地理信息
一、PyMuPDF库介绍
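PyMuPDF库的官方文档见 https://pymupdf.readthedocs.io ,主页如下图所示,里面详细介绍了怎么提取图片、文本、表格等主要功能。下文代码通过 import fitz 使用PyMuPDF,另外还用到标准库 os 以及 python-docx(import docx)。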
二、核心代码
1.PDF逐页保存图片
将PDF按页依次保存为PNG图片。
def pdf2img(pdfPath, imgPath):
    """Render every page of a PDF to its own PNG file.

    Each page is written into ``imgPath`` as ``img_<index>.png``, where the
    index is the zero-based page number. The directory is created if it does
    not exist yet.

    Args:
        pdfPath: Path of the PDF document to render.
        imgPath: Directory that receives the PNG files.
    """
    # A zoom factor of 2 on each axis doubles width and height, so the
    # rendered image holds four times as many pixels as the default render.
    zoom_x = 2  # horizontal zoom
    zoom_y = 2  # vertical zoom
    rotate = 0  # no rotation
    # The matrix is the same for every page, so build it once.
    mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    pdfDoc = fitz.open(pdfPath)
    try:
        print("pdf文件共 {} 页".format(pdfDoc.page_count))
        # Create the output directory once instead of re-checking per page.
        os.makedirs(imgPath, exist_ok=True)
        for i, page in enumerate(pdfDoc):
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Pixmap.save picks the output format from the file extension.
            pix.save(os.path.join(imgPath, 'img_%s.png' % i))
    finally:
        # Release the document even when rendering a page fails.
        pdfDoc.close()
2.照片转PDF
将许多张照片转换成一个PDF文件。
def img2pdf(imgdir, pdfname):
    """Combine the image files of a directory into one PDF, one image per page.

    Files are processed in natural order (``img_2`` before ``img_10``) so the
    page order is deterministic. Entries that are not regular files are
    skipped. Every page gets the pixel dimensions of its image.

    Args:
        imgdir: Directory containing the image files.
        pdfname: Path of the PDF file to write.
    """
    import re  # local import: only needed for the natural-sort key

    def natural_key(filename):
        # re.split with a capturing group alternates text, digits, text, ...
        # so odd positions are always digit runs and can be compared as ints.
        parts = re.split(r"(\d+)", filename)
        return [int(p) if idx % 2 else p.lower() for idx, p in enumerate(parts)]

    # os.listdir returns names in arbitrary order; sort for a stable result.
    imglist = sorted(os.listdir(imgdir), key=natural_key)

    doc = fitz.open()  # empty PDF that collects the pictures
    try:
        for f in imglist:
            path = os.path.join(imgdir, f)
            if not os.path.isfile(path):
                continue  # sub-directories cannot be opened as images
            img = fitz.open(path)  # open the picture as a document
            try:
                print(img)
                rect = img[0].rect  # picture dimensions
                pdfbytes = img.convert_to_pdf()  # picture as a PDF stream
            finally:
                img.close()  # no longer needed
            imgPDF = fitz.open("pdf", pdfbytes)  # open the stream as a PDF
            try:
                # New page sized exactly like the picture ...
                page = doc.new_page(width=rect.width, height=rect.height)
                # ... which the picture then fills completely.
                page.show_pdf_page(rect, imgPDF, 0)
            finally:
                imgPDF.close()
        doc.save(pdfname)
    finally:
        doc.close()
3.PDF转Txt与Word文件
将PDF文件中的文字提取出来,并同时保存为txt文件和Word(.docx)文件。
def pdf2text(fname):
    """Extract the plain text of a PDF into a .txt file and a .docx file.

    Both output files are written next to the input, using the input path
    with its extension replaced by ``.txt`` and ``.docx``. In the text file
    every page is followed by a form feed byte (0x0C).

    Args:
        fname: Path of the PDF document to read.
    """
    import os  # local import: used only to strip the file extension

    # splitext removes just the final extension, so dots elsewhere in the
    # path (e.g. "./docs/report.v2.pdf") do not truncate the output name.
    name = os.path.splitext(fname)[0]

    worddoc = docx.Document()  # Word document receiving the paragraphs
    doc = fitz.open(fname)
    try:
        with open(name + ".txt", "wb") as out:
            for page in doc:
                # Encoding with errors='ignore' drops characters that cannot
                # be represented in UTF-8.
                text = page.get_text("text").encode("utf-8", errors='ignore')
                out.write(text)
                out.write(bytes((12,)))  # page delimiter (form feed 0x0C)
                # NOTE(review): paragraphs are split on tab+newline and the
                # remaining newlines are removed - confirm this matches the
                # layout of the PDFs being processed.
                texts = text.decode('utf-8').split('\t\n')
                for line in texts:
                    worddoc.add_paragraph(line.replace("\n", ""))
    finally:
        doc.close()

    worddoc.save(name + '.docx')
4.PDF提取图片
def getImginPDF(fname):
    """Extract every image embedded in a PDF and save each one as a PNG.

    Files are written to the current working directory and named
    ``page<page>_<index>.png``, where both the page number and the image
    index within that page are zero-based. Including the index keeps several
    images on one page from overwriting each other.

    Args:
        fname: Path of the PDF document to read.
    """
    doc = fitz.open(fname)
    try:
        for page in range(doc.page_count):
            # Each entry is a tuple whose first item is the image xref.
            for index, image in enumerate(doc.get_page_images(page, full=True)):
                xref = image[0]
                pix = fitz.Pixmap(doc, xref)
                # More than three colour components (alpha excluded) means
                # CMYK, which has to be converted to RGB before PNG output.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save("page%s_%s.png" % (page, index))
                pix = None  # drop the pixmap before loading the next image
    finally:
        doc.close()