import os
import win32com.client as win32
import pandas as pd
# Extract every table from the Word documents in a folder and export them to
# two Excel workbooks:
#   * all_tables.xlsx  - sheet "Combined_Table": one line per table row, with
#     the source file name and the row's cells stored as a single list value.
#   * split_table.xlsx - the same rows with each cell in its own column.

# Replace with the path of the folder that contains the Word documents.
word_documents_folder = r"C:\Users\admin\Desktop\word\sum\sum"

# Set to True to drop the first row (usually the header) of every table.
SKIP_FIRST_ROW = False


def _clean_cell_text(raw_text):
    """Return *raw_text* without non-printable characters, stripped of spaces."""
    return ''.join(ch for ch in raw_text if ch.isprintable()).strip()


def _extract_table_rows(table, skip_first_row=False):
    """Return one list of cleaned cell strings for each row of a Word table."""
    rows = []
    for row_number, row in enumerate(table.Rows):
        if skip_first_row and row_number == 0:
            continue
        rows.append([_clean_cell_text(cell.Range.Text) for cell in row.Cells])
    return rows


# Parallel lists: file_names[k] is the document that table_rows[k] came from.
# Rows are accumulated in plain lists and turned into a DataFrame once, which
# avoids the removed DataFrame.append and its repeated copying.
file_names = []
table_rows = []

# Start Word once for the whole run and always shut it down, even on errors,
# so that no orphaned Word process is left behind.
word = win32.gencache.EnsureDispatch("Word.Application")
try:
    for filename in os.listdir(word_documents_folder):
        # "~$..." entries are Word lock files, not real documents.
        if filename.startswith("~$") or not filename.lower().endswith(".doc"):
            continue
        doc = word.Documents.Open(os.path.join(word_documents_folder, filename))
        try:
            file_name = os.path.splitext(filename)[0]
            for table in doc.Tables:
                for row_data in _extract_table_rows(table, SKIP_FIRST_ROW):
                    file_names.append(file_name)
                    table_rows.append(row_data)
        finally:
            # The document is only read, so close it without saving.
            doc.Close(False)
finally:
    word.Quit()

# Naming both columns up front keeps them present when no table was found.
combined_data = pd.DataFrame({'File Name': file_names, 'Table Data': table_rows})

# The context manager saves and closes the workbook (ExcelWriter.save() no
# longer exists in current pandas). The workbook is always created from
# scratch, so the sheet is written exactly once, with its header.
with pd.ExcelWriter('all_tables.xlsx', engine='xlsxwriter') as excel_writer:
    combined_data.to_excel(excel_writer, sheet_name="Combined_Table", index=False)

# The extracted cells live in a single list-valued column; split them into one
# column per cell. Rows shorter than the widest row are padded with blanks.
split_data = pd.DataFrame(table_rows, index=combined_data.index)
result = pd.concat([combined_data['File Name'], split_data], axis=1)
result.to_excel('split_table.xlsx', index=False)