python pdf转Excel

最新推荐文章于 2024-08-14 22:21:23 发布

宇宙全能王

最新推荐文章于 2024-08-14 22:21:23 发布

阅读量1.8k

点赞数 1

分类专栏： python excel pandas 文章标签： pdf

本文链接：https://blog.csdn.net/chang1976272446/article/details/107854838

版权

python 同时被 3 个专栏收录

114 篇文章 3 订阅

订阅专栏

excel

23 篇文章 0 订阅

订阅专栏

pandas

15 篇文章 0 订阅

订阅专栏

源文件为 test.PDF（与下方代码中的文件名一致；在区分大小写的文件系统上，文件名大小写必须与实际文件完全相同）
目标文件为 pdf.xlsx


import pdfplumber  # 关键在这个库
import pandas as pd


def func(src, dest='pdf.xlsx', start_marker='附表：网下投资者初步配售明细'):
    """Copy a table that spans several PDF pages into one Excel sheet.

    Pages are scanned in order.  Everything before the first page whose
    text (ignoring leading whitespace) starts with ``start_marker`` is
    skipped.  From that page on, the table extracted from each page is
    appended to the sheet, one below the other.

    The first row of every table after the first one is dropped, on the
    assumption that each page repeats the table header.

    Args:
        src: Path of the PDF file to read.
        dest: Path of the Excel workbook to write.
        start_marker: Text that the first page of the table starts with.
            An empty value means "start from the very first page".
    """
    # With an empty marker there is nothing to wait for.
    started = not start_marker
    # Next free sheet row, which is also the number of rows written so far.
    next_row = 0

    # Both resources are context-managed: the PDF handle is released and
    # the workbook is saved exactly once on exit.  Explicit writer.save()
    # is not needed (and no longer exists in pandas >= 2.0).
    with pdfplumber.open(src) as pdf, pd.ExcelWriter(dest) as writer:
        for i, page in enumerate(pdf.pages):
            print('reading page %d' % i)

            if not started:
                # extract_text() may yield nothing for image-only pages.
                content = page.extract_text() or ''
                if not content.lstrip().startswith(start_marker):
                    continue
                started = True

            # Best effort: a page whose table cannot be extracted is
            # reported and skipped instead of aborting the whole export.
            try:
                table = page.extract_table()
            except Exception as exc:
                print('skipping page %d: %s' % (i, exc))
                continue
            if not table:
                # No table on this page.
                continue

            df = pd.DataFrame(table)
            if next_row > 0:
                # Drop the repeated header row on continuation pages.
                df = df[1:]
            if df.empty:
                continue

            df.to_excel(writer, header=False, index=False, startrow=next_row)
            next_row += len(df)

        if next_row == 0:
            # A workbook needs at least one sheet to be saved; emit an
            # empty one so "no table found" does not end in an obscure
            # error from the Excel engine.
            pd.DataFrame().to_excel(writer, header=False, index=False)


# Script entry point: convert the sample PDF using the default output path.
if __name__ == "__main__":
    func(src='test.PDF')