python pdfplumber读取PDF指定表格内容批量文件重命名改良版

最新推荐文章于 2024-03-10 18:33:44 发布

Wilburzzz

最新推荐文章于 2024-03-10 18:33:44 发布

阅读量1.1k

点赞数

分类专栏： python小程序代码　文章标签： python

本文链接：https://blog.csdn.net/Wilburzzz/article/details/109531771

版权

python小程序代码专栏收录该内容

3 篇文章 0 订阅

订阅专栏

这是后来帮大学同学写的需求改良版。需求改为获取PDF表格中某一行的姓名和另外一行的姓名，两者都在第一页。解决思路是设置多条正则表达式规则，依次尝试匹配文字，如果结果为空就继续尝试下一条规则。因为笔者的PDF文件中同一个姓名重复出现不会超过7次，所以在重命名时设置了8次重命名尝试。由于PDF文件含有重要的敏感信息，这里只给出如下代码供参考或提供思路，PDF就不提供了。看看有帮助就好，没有我也没办法，我目前主攻的方向不是这个。

import sys
sys.setrecursionlimit(5000)  # 使用pyinstaller打包成EXE格式出错解决语句之一
import os
import re
import pdfplumber
# Folder whose PDFs are renamed. Raw string so the backslash is never parsed
# as an escape sequence (the runtime value is unchanged).
file_dir = r'D:\你的文件夹'

# Text that marks the row holding the second name; when it is absent from the
# first page the file is named with the '无人帮扶' prefix instead.
# NOTE(review): presumably a date stamp on the helper's row - confirm.
HELPER_DATE_MARKER = '20201231'

# Patterns for the first name, tried strictly in this order; the first pattern
# that matches anything wins.  Several variants are needed because the text
# after the name differs by gender and by whether a space was extracted.
RESIDENT_NAME_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        '1(.*?)男 居民',
        '1(.*?)男 残疾',
        '1(.*?)男居民',
        '1(.*?)男残疾',
        '1(.*?)女 居民',
        '1(.*?)女 残疾',
        '1(.*?)女居民',
        '1(.*?)女残疾',
    )
)


def read_first_page_text(pdf_path):
    """Return the text of the first page of *pdf_path* ('' if there is none).

    The PDF is closed before returning, so the caller can rename the file
    immediately (an open handle blocks renaming on Windows).
    """
    with pdfplumber.open(pdf_path) as pdf:
        if not pdf.pages:
            return ''
        # extract_text() can yield None for a page without text.
        return pdf.pages[0].extract_text() or ''


def extract_resident_name(text):
    """Return the first name captured by RESIDENT_NAME_PATTERNS, or ''.

    The captured text is returned untouched (surrounding spaces included).
    The original ran eval() on the repr of the match list, which raised
    SyntaxError when nothing matched and produced a tuple repr when several
    rows matched; taking the first match avoids both and never evaluates
    text that came out of a PDF.
    """
    for pattern in RESIDENT_NAME_PATTERNS:
        matches = pattern.findall(text)
        if matches:
            return matches[0]
    return ''


def extract_helper_name(text, marker=HELPER_DATE_MARKER):
    """Return up to four characters of the text found in front of *marker*.

    Candidates are the spans between a leading digit ('1', '2', '3' or '4')
    and *marker*.  The length-based switching between digits is kept exactly
    as in the original script.  Returns '' when no candidate is found.
    """
    def find(prefix):
        return re.findall(prefix + '(.*?)' + re.escape(marker), text)

    matches = find('1')
    if matches:
        if len(matches[0]) < 7:
            matches = find('2')
            if not matches:
                # NOTE(review): the original hit an IndexError here that was
                # swallowed, leaving the name empty without trying '4'.
                # Preserved as-is; confirm whether a fallback was intended.
                return ''
        if len(matches[0]) == 8:
            matches = find('3')
        if not matches:
            matches = find('4')
    # Slice the match itself rather than its repr, so a short match no longer
    # drags "']" into the file name.
    return matches[0][:4] if matches else ''


def rename_unique(src_path, stem, suffix='.pdf'):
    """Rename *src_path* to ``stem + suffix`` in the same folder.

    If that name is taken, 2, 3, 4, ... is appended to *stem* until a free
    name is found (the original stopped after 8).  Existence is checked
    explicitly because os.rename only fails on an existing destination on
    Windows; elsewhere it would silently overwrite the other file.

    Returns the final path.  A file that already carries the chosen name is
    left alone.
    """
    folder = os.path.dirname(src_path)
    current = os.path.normcase(os.path.abspath(src_path))
    attempt = 1
    while True:
        number = '' if attempt == 1 else str(attempt)
        target = os.path.join(folder, stem + number + suffix)
        if os.path.normcase(os.path.abspath(target)) == current:
            return target
        if not os.path.exists(target):
            os.rename(src_path, target)
            return target
        attempt += 1


def rename_pdfs(root_dir, marker=HELPER_DATE_MARKER):
    """Rename every PDF under *root_dir* from the text on its first page.

    New names are ``<helper>--<resident>.pdf`` when *marker* occurs on the
    page, otherwise ``无人帮扶<n>--<resident>.pdf`` with *n* counting from 1
    in each folder.  Files are handled folder by folder with full paths, so
    subfolders work and the list of names never goes stale.  Files without a
    ``.pdf`` extension are skipped.
    """
    for folder, _subdirs, names in os.walk(root_dir):
        count = 1
        for name in names:
            if not name.lower().endswith('.pdf'):
                continue
            src_path = os.path.join(folder, name)
            text = read_first_page_text(src_path)
            resident = extract_resident_name(text)
            if marker in text:
                stem = extract_helper_name(text, marker) + '--' + resident
            else:
                stem = '无人帮扶' + str(count) + '--' + resident
                count += 1
            target = rename_unique(src_path, stem)
            # Report the name that was actually used, suffix included.
            print('原文件名：%s 重命名为:%s' % (name, os.path.basename(target)))


if __name__ == '__main__':
    # Kept from the original script: fails fast if the folder is missing
    # (os.walk alone would silently do nothing).
    os.chdir(file_dir)
    rename_pdfs(file_dir)

Wilburzzz

关注

0
点赞
踩
10

收藏

觉得还不错? 一键收藏
打赏
3
评论
python pdfplumber读取PDF指定表格内容批量文件重命名改良版

后面帮大学同学写的需求改良版import syssys.setrecursionlimit(5000) # 使用pyinstaller打包成EXE格式出错解决语句之一import osimport reimport pdfplumber# 设置初始目录file_dir = 'D:\贫困户信息对照表（2020年）(1)'file_list = [] # 设置空列表用来接收文件夹下的文件名称os.chdir('D:\贫困户信息对照表（2020年）(1)')for files in os.
复制链接

扫一扫