Word文档中有大量表格,需要将这些表格转移到Excel中。使用Python的pandas和python-docx(导入名为docx)库可以批量完成这一操作。
转为
from docx import Document
import pandas as pd
def get_columns(document, name_0, num_0, name_1, num_1) -> list:
    """Return the header names of the first table matching two header cells.

    A table matches when the text of its first-row cell at index ``num_0``
    equals ``name_0`` and the text of its first-row cell at index ``num_1``
    equals ``name_1``. The comparison uses the raw cell text; only the
    returned names are cleaned.

    Args:
        document: Object exposing ``tables``; each table must provide
            ``cell(row, col)`` and ``rows``, and each cell a ``text``
            attribute (the script passes a python-docx document).
        name_0: Expected header text of the column at index ``num_0``.
        num_0: Zero-based column index checked against ``name_0``.
        name_1: Expected header text of the column at index ``num_1``.
        num_1: Zero-based column index checked against ``name_1``.

    Returns:
        The first-row cell texts of the first matching table, with line
        breaks removed and surrounding whitespace stripped. An empty list
        when no table matches.
    """
    for table in document.tables:
        try:
            is_match = (
                table.cell(0, num_0).text == name_0
                and table.cell(0, num_1).text == name_1
            )
        except IndexError:
            # The table has fewer columns than the requested index, so it
            # cannot be the wanted one; skip it instead of aborting the scan.
            continue
        if is_match:
            # Only the first matching table defines the column names.
            return [
                cell.text.replace('\n', '').replace('\r', '').strip()
                for cell in table.rows[0].cells
            ]
    return []
def doc2excel(document, excel_df, name_0, num_0, name_1, num_1) -> pd.DataFrame:
    """Collect the data rows of every matching table into one DataFrame.

    A table matches when the text of its first-row cell at index ``num_0``
    equals ``name_0`` and the text of its first-row cell at index ``num_1``
    equals ``name_1``. The first row of each matching table is treated as
    the header and skipped; every other row is appended as one record.

    Args:
        document: Object exposing ``tables``; each table must provide
            ``cell(row, col)`` and ``rows``, each row ``cells``, and each
            cell a ``text`` attribute (the script passes a python-docx
            document).
        excel_df: DataFrame whose columns define the record layout; the
            collected rows are appended after its existing rows.
        name_0: Expected header text of the column at index ``num_0``.
        num_0: Zero-based column index checked against ``name_0``.
        name_1: Expected header text of the column at index ``num_1``.
        num_1: Zero-based column index checked against ``name_1``.

    Returns:
        A DataFrame with the rows of ``excel_df`` followed by the collected
        rows, indexed from 0. ``excel_df`` itself is returned when nothing
        was collected.

    Raises:
        ValueError: If a data row's cell count differs from the number of
            columns in ``excel_df``.
    """
    expected_width = len(excel_df.columns)
    collected = []

    for table in document.tables:
        try:
            is_wanted = (
                table.cell(0, num_0).text == name_0
                and table.cell(0, num_1).text == name_1
            )
        except IndexError:
            # The table has fewer columns than the requested index, so it
            # cannot be one of the wanted tables.
            continue
        if not is_wanted:
            continue

        for row_idx, row in enumerate(table.rows):
            if row_idx == 0:
                continue  # header row
            values = [
                cell.text.replace('\n', '').replace('\r', '').strip()
                for cell in row.cells
            ]
            if len(values) != expected_width:
                raise ValueError(
                    f"row {row_idx} has {len(values)} cells, "
                    f"expected {expected_width}"
                )
            collected.append(values)

    if collected:
        # Build the frame once instead of appending row by row through the
        # private DataFrame._append, which copies the whole frame each time.
        new_rows = pd.DataFrame(collected, columns=excel_df.columns)
        if excel_df.empty:
            excel_df = new_rows
        else:
            excel_df = pd.concat([excel_df, new_rows], ignore_index=True)

    print(excel_df.head())
    return excel_df
if __name__ == "__main__":
    # Two (header text, column index) pairs pick out the tables to export.
    name_0, num_0 = 'a', 0
    name_1, num_1 = 'b', 1

    # Source Word document and destination workbook.
    path = r"C:\Users\13258\Desktop\test\新建 Microsoft Word 文档.docx"
    save_path = r'C:\Users\13258\Desktop\test\t.xlsx'

    document = Document(path)

    # Start from an empty frame whose columns come from the first matching
    # table, then fill it with the rows of every matching table.
    excel_df = pd.DataFrame(
        columns=get_columns(document, name_0, num_0, name_1, num_1)
    )
    excel_df = doc2excel(document, excel_df, name_0, num_0, name_1, num_1)

    # Write the result without the DataFrame index column.
    excel_df.to_excel(save_path, index=False)
输出