在网上找了PPT提取表格并标记表格所在页码的代码,居然没有发现开箱即用的,索性自己又写了写,改了改.
实现目标:解析PPT中的表格,并且对于同一页的多个表格可以分别识别,在表格输出命名上,标记出表格所在的PPT页码数.
废话不多,且看以下代码:
import pptx
from pptx import Presentation
import os
import sys
import csv
#提取表格
def extract_tables_from_pptx(pptx_file, output_dir="./output/table/知识蒸馏分享/"):
    """Export every table found in a PowerPoint file to its own CSV file.

    Each table is written to ``table{n}_{s}.csv`` inside *output_dir*, where
    ``n`` is a running count of tables over the whole presentation (starting
    at 1) and ``s`` is the zero-based index of the slide holding the table
    (so the first slide is ``0``).

    Args:
        pptx_file: Location of the presentation; passed unchanged to
            ``Presentation``.
        output_dir: Directory that receives the CSV files. It is created on
            demand, and only when at least one table is found.

    Returns:
        A list with the path of every CSV file written, in the order the
        tables were encountered. Empty when the presentation has no tables.
    """
    prs = Presentation(pptx_file)
    written_paths = []
    # Advances once per table (not once per slide) so that several tables on
    # the same slide never end up sharing a file name.
    table_count = 0

    for slide_index, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            if not shape.has_table:
                continue
            table_count += 1

            # One list of cell texts per table row.
            rows = [[cell.text for cell in row.cells] for row in shape.table.rows]

            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(output_dir, exist_ok=True)
            csv_path = os.path.join(
                output_dir, f"table{table_count}_{slide_index}.csv"
            )
            # newline='' lets the csv module control line endings itself.
            with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                csv.writer(csvfile).writerows(rows)
            written_paths.append(csv_path)

    return written_paths
if __name__ == '__main__':
    # Script entry point: extract all tables from the sample presentation.
    # (Looping over every .ppt/.pptx file in a directory was sketched here
    # earlier but is not implemented.)
    extract_tables_from_pptx(pptx_file='./data/知识蒸馏分享.pptx')
喜欢的话就点个赞吧