关于PDF文本的解析与PDF图片的提取

Glyann

已于 2024-05-07 12:20:04 修改

阅读量2.5k

点赞数 2

分类专栏： python 文章标签： linq microsoft c#

于 2020-03-03 10:45:58 首次发布

本文链接：https://blog.csdn.net/Gzigithub/article/details/104626953

版权

python 专栏收录该内容

21 篇文章

订阅专栏

1. 利用python读取PDF文本内容

一，问题描述

　　利用python读取PDF文本内容

二，运行环境

　　python 3.6

三，需要安装的库

pip install pdfminer

对pdfminer的简单介绍，官网介绍如下：

　　PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. PDFMiner allows one to obtain the exact location of text in a page, as well as other information such as fonts or lines. It includes a PDF converter that can transform PDF files into other text formats (such as HTML). It has an extensible PDF parser that can be used for purposes other than text analysis.

翻译是这样的：

PDFMiner是一个从PDF文档中提取信息的工具。与其他pdf相关的

工具不同，它完全专注于获取和分析文本数据。PDFMiner允许获取

页面中文本的确切位置，以及其他信息，比如字体或行。它包括一

个PDF转换器，可以将PDF文件转换成其他文本格式(如HTML)。

它有一个可扩展的PDF解析器，还可以用于文本分析以外的其他用途。

参考链接：深入学习python解析并读取PDF文件内容的方法 - 战争热诚 - 博客园

2. 利用python提取PDF图片

一，问题描述

　　利用python获取PDF图片

二，运行环境

　　python 3.6

三，需要安装的库

pip install pymupdf

参考链接：Python提取PDF中的图片_Jayce~的博客-CSDN博客_python提取pdf中的图片

3. 实现源代码

# -*- coding:utf-8 -*-
import fitz
import time,os.path,re
time1=time.time()
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams,LTTextBoxHorizontal
from pdfminer.pdfpage import PDFTextExtractionNotAllowed,PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument



class CPdf2TxtManager():
    """Extract text (via pdfminer) and embedded images (via PyMuPDF) from a PDF."""

    @staticmethod
    def _first_attr(obj, *names):
        """Return the first attribute of *obj* found among *names*.

        Used to prefer a current PyMuPDF method name while still falling
        back to the legacy name that older releases expose.
        """
        for name in names:
            attr = getattr(obj, name, None)
            if attr is not None:
                return attr
        raise AttributeError(
            "{!r} has none of the attributes: {}".format(obj, ", ".join(names)))

    def changePdfToText(self, filePath):
        """Extract the text of every page of the PDF at *filePath*.

        Each layout object exposing ``get_text`` contributes one chunk.
        When at least one chunk is found, the chunks are appended (one
        per line, UTF-8) to a ``.txt`` file that sits next to the PDF and
        shares its base name.

        Args:
            filePath: Path of the PDF file to read.

        Returns:
            The concatenation of all extracted text chunks; an empty
            string when the document yields no text.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids text
                extraction.
        """
        chunks = []
        # Open the file named by the parameter (not a module-level global)
        # and make sure it is closed even when parsing fails.
        with open(filePath, 'rb') as file:
            # Parser reading from the binary file object.
            parser = PDFParser(file)
            # Document object holding the PDF structure; empty password.
            doc = PDFDocument(parser, password='')
            # Refuse to continue when the PDF disallows text extraction.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Resource manager for shared resources; caching disabled.
            rsrcmgr = PDFResourceManager(caching=False)
            laparams = LAParams()
            # Aggregator device that collects the layout of a page.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # The aggregator only holds the most recently processed
                # page, so its result must be read inside the page loop.
                layout = device.get_result()
                for x in layout:
                    if hasattr(x, "get_text"):
                        chunks.append(x.get_text())

        if chunks:
            # Open the output once instead of once per text chunk; append
            # mode is kept so existing content is preserved.
            txt_path = os.path.splitext(filePath)[0] + '.txt'
            with open(txt_path, 'a', encoding='utf-8') as f:
                f.writelines(chunk + '\n' for chunk in chunks)
        return "".join(chunks)

    def pdf2pic(self, path, pic_path):
        """Save every image XObject of the PDF at *path* as a PNG file.

        Images are written into *pic_path* as ``图片<N>.png`` with N
        counting from 1. A summary line about the document is printed
        first; the elapsed time and the number of extracted images are
        printed once all objects have been scanned.

        Args:
            path: Path of the PDF file to scan.
            pic_path: Existing directory that receives the PNG files.
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement for measuring elapsed time.
        t0 = time.perf_counter()
        # An object is an image when its dictionary carries both
        # "/Type /XObject" and "/Subtype /Image".
        checkXO = r"/Type(?= */XObject)"
        checkIM = r"/Subtype(?= */Image)"
        doc = fitz.open(path)
        imgcount = 0
        try:
            # Prefer the current PyMuPDF names, fall back to legacy ones.
            xref_length = self._first_attr(doc, "xref_length", "_getXrefLength")
            xref_object = self._first_attr(doc, "xref_object", "_getXrefString")
            lenXREF = xref_length()
            print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
            for i in range(1, lenXREF):
                text = xref_object(i)
                # Skip anything that is not both an XObject and an image.
                if not (re.search(checkXO, text) and re.search(checkIM, text)):
                    continue
                imgcount += 1
                pix = fitz.Pixmap(doc, i)
                new_name = "图片{}.png".format(imgcount)
                # Four or more colour components (alpha excluded) means
                # CMYK, which must be converted to RGB before PNG output.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                save = self._first_attr(pix, "save", "writePNG")
                save(os.path.join(pic_path, new_name))
                pix = None  # drop the pixmap before handling the next image
        finally:
            doc.close()
        # Report once, after the whole document has been scanned.
        t1 = time.perf_counter()
        print("运行时间:{}s".format(t1 - t0))
        print("提取了{}张图片".format(imgcount))
    
    
    
if __name__ == '__main__':
    # Script entry point: extract the text of a fixed PDF; when no text is
    # found, extract its embedded images instead.
    path = r'C:/Users/Administrator/Desktop/11.pdf'
    pdf2TxtManager = CPdf2TxtManager()
    df = pdf2TxtManager.changePdfToText(path)
    print(df)
    # The image fallback only makes sense when no text was extracted, and
    # pic_path exists only in that case, so the folder handling and the
    # pdf2pic call all live inside this branch.
    if not df:
        pic_path = r'C:/Users/Administrator/Desktop/pic'
        # Create the folder that receives the images if it is missing.
        if os.path.exists(pic_path):
            print("文件夹已存在，不必重新创建！")
        else:
            os.mkdir(pic_path)
        pdf2TxtManager.pdf2pic(path, pic_path)
    time2 = time.time()
    print('ok,解析pdf结束!')
    # time1 is taken at module import time, so this covers the whole run.
    print('总共耗时：' + str(time2 - time1) + 's')