python pdfplumber读取PDF指定表格内容批量文件重命名

最新推荐文章于 2024-08-09 04:15:09 发布

Wilburzzz

最新推荐文章于 2024-08-09 04:15:09 发布

阅读量1.6k

点赞数 1

分类专栏：python小程序代码　文章标签：python、开发工具、数据分析

本文链接：https://blog.csdn.net/Wilburzzz/article/details/109452082

版权

python小程序代码专栏收录该内容

3 篇文章 0 订阅

订阅专栏

帮以前的大学写的一个小程序代码，写的时间比较急，有点乱，是一天之内完成的作品，给有兴趣的人看一下，希望能起到抛砖引玉的作用：

import sys
sys.setrecursionlimit(5000)  # 使用pyinstaller打包成EXE格式出错解决语句之一
import os
import re
import pdfplumber
# Folder whose PDF files are renamed. A raw string keeps the backslash
# literal instead of relying on an unrecognised escape sequence.
file_dir = r'D:\你的文件夹'

# Patterns tried in order; the first one that yields a non-blank capture wins.
# The text between the leading "1" and the gender marker is taken as the name.
# Because the gender can be either 男 or 女, and because unrelated text may
# also happen to match, the more specific "女 ..." patterns are tried before
# the bare "女" one. Adjust this list to fit the actual documents.
NAME_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'1(.*?)男',
        r'1(.*?)女 居民',
        r'1(.*?)女 残疾',
        r'1(.*?)女',
    )
)

# Characters that cannot (or should not) appear in a file name; this covers
# the Windows reserved set plus control characters and both path separators.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_name(text):
    """Return the name captured from *text*, or None when nothing matches.

    *text* is the plain text of a PDF page; it may be None or empty (text
    extraction can yield nothing for a page). Each pattern in NAME_PATTERNS
    is tried in order and the first match of the first successful pattern is
    used. Surrounding whitespace is stripped from the capture, and a capture
    that is blank after stripping is treated as "no match" for that pattern.
    """
    if not text:
        return None
    for pattern in NAME_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        name = match.group(1).strip()
        if name:
            return name
    return None


def sanitize_stem(name):
    """Replace characters that are unsafe in a file name with underscores.

    The name comes from document contents, so it must never be able to
    introduce a path separator or a character the file system rejects.
    """
    return _UNSAFE_FILENAME_CHARS.sub('_', name)


def unique_target(directory, stem, suffix='.pdf'):
    """Return a path inside *directory* for *stem* that does not exist yet.

    The first candidate is ``stem + suffix``; if that is taken, ``stem2``,
    ``stem3``, ... are tried in turn, so any number of documents carrying the
    same name can be renamed without overwriting one another.
    """
    candidate = os.path.join(directory, stem + suffix)
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f'{stem}{counter}{suffix}')
        counter += 1
    return candidate


def read_page_text(path, page_index=0):
    """Return the extracted text of one page of the PDF at *path*.

    *page_index* is zero-based: 0 is the first page, 1 the second, and so on.
    Returns None when the document has no such page or the page has no text.
    The PDF is always closed again, even if extraction raises.
    """
    with pdfplumber.open(path) as pdf:
        pages = pdf.pages
        if page_index >= len(pages):
            return None
        return pages[page_index].extract_text()


def rename_pdfs(root_dir):
    """Rename every PDF under *root_dir* after the name found on its first page.

    Sub-folders are visited as well; each file is renamed inside the folder
    it already lives in. Files that are not PDFs, and PDFs in which no name
    can be found, are left untouched.
    """
    for folder, _subdirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue
            source_path = os.path.join(folder, filename)
            name = extract_name(read_page_text(source_path))
            if name is None:
                print(f'skipped (no name found): {source_path}')
                continue
            stem = sanitize_stem(name)
            # Already carries the right name: renaming would only add a
            # pointless numeric suffix because the target "exists".
            if os.path.normcase(filename) == os.path.normcase(stem + '.pdf'):
                continue
            # Pick a free name explicitly instead of relying on os.rename to
            # fail: on POSIX it silently replaces an existing target file.
            target_path = unique_target(folder, stem)
            os.rename(source_path, target_path)
            print(f'{source_path} -> {target_path}')


if __name__ == '__main__':
    rename_pdfs(file_dir)