python提取PDF文件

小木可菜鸟测试一枚

已于 2022-12-12 22:07:58 修改

阅读量1.4k

点赞数 1

分类专栏： python 文章标签： python 开发语言

于 2022-06-26 22:46:58 首次发布

本文链接：https://blog.csdn.net/m0_51709670/article/details/125475687

版权

本文介绍了如何利用Python的pdfplumber库（基于pdfminer.six）读取PDF文件，并详细阐述了读取PDF中的坐标和文本的运行机制。

摘要由CSDN通过智能技术生成

一、1

import os
import pdfplumber
from common.log import Log
from common.data_process import FileOperate
from common.config_reader import ConfigReader
import datetime

class BaseMethod:

    def __init__(self):
        """Create the logger, look up the PDF path and create the file helper.

        ``self.file_path`` is whatever ``ConfigReader().get_value`` returns
        for the ``("file", "Case_file_path")`` pair.
        """
        self.log = Log()
        config = ConfigReader()
        self.file_path = config.get_value("file", "Case_file_path")
        self.fileoperate = FileOperate()

    def open_path(self):
        # 获取pdf文件路径
        (self.dirname, self.filename) = os.path.split(self.file_path)
        (self.file, extension) = os.path.splitext(self.filename)
        if self.dirname == "":
            return -1
        elif self.filename == "":
            return -1
        else:
            return 0

    def as_name(self):
        # pdf 提取信息后另存为的路径
        if not os.path.exists(self.dirname):
            os.mkdir(self.dirname)
        timestr = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.savefile = os.path.join(self.dirname, self.file+'-'+timestr)
        try:
            if len(self.savefile) <= 100:
                return self.savefile
        except:
            self.log.logMsg(2, 'Failed to get file')
            return None

    def pages(self):
        """Open the PDF at ``self.file_path`` and return its first page.

        Returns:
            The first entry of the opened document's ``pages``, or None
            when the document has no pages.

        NOTE(review): only the first page is ever returned, and the handle
        obtained from ``pdfplumber.open`` is not closed here -- confirm
        both are intended.
        """
        document = pdfplumber.open(self.file_path)
        page_list = document.pages
        if len(page_list) > 0:
            return page_list[0]
        return None

    def as_txt_file(self):
        # 读取pdf文件，写入txt文件
        txt_file = self.as_name()
        try:
            table = self.pages().extract_text()
            self.fileoperate.writefile(txt_file + ".txt", "txt", table