一个基础的代码演示(对于合并的单元格, 思路来源于文章《python docx处理word文档中表格合并问题》), 可以读取Word文档中的所有表格/标题, 并在Excel中为每个表格创建相应的sheet
原先的想法是只打开一次Excel文件, 然后一次性写入所有表格; 网上搜了一圈没有找到好办法, 只能采用每写一个新表都重新打开文件的方式. 所以如果要重新生成相同文件名的文件, 需要先删除原有的文件(相关处理见save_sheet).
如上表格(引自引用文章), 处理到Excel中为:
如遇xlsx文件低版本office打不开请使用xls格式. 希望能帮助到你.
import re
import docx
import xlwt
from numpy import *
# 改写Excel
import xlrd
from xlutils.copy import copy
# Path of the Word document that main() opens.
docFile = 'source.docx'
# Path of the workbook that main() hands to save_all_sheets().
# NOTE(review): save_sheet builds the workbook with xlwt, which targets the
# legacy .xls format, so the .xlsx extension may not match the file content
# -- confirm and consider renaming to 'des.xls'.
xlsFile = 'des.xlsx'
# Collect the table titles; adapt the matching rule to your own document.
def get_tables_names(doc_paragraphs, heading_level):
    """Derive one table (sheet) name from every non-empty paragraph.

    Paragraphs whose text is empty are skipped. For each remaining paragraph
    the first run of ASCII letters/underscores in its text becomes the name;
    when there is no such run a placeholder containing the 1-based position
    of the paragraph (among the non-empty ones) is used instead. Progress is
    printed to stdout.

    Args:
        doc_paragraphs: Sequence of objects exposing a ``text`` attribute.
            The sequence is only read; it is not modified.
        heading_level: Currently unused. Filtering paragraphs by heading
            style is not implemented; the parameter is kept so existing
            callers keep working.

    Returns:
        A list of names, one per non-empty paragraph, in input order.
    """
    # Build a filtered copy instead of removing entries from the caller's list.
    titled = [paragraph for paragraph in doc_paragraphs if paragraph.text]
    print("检测到%d个标题" % len(titled))

    names = []
    for number, paragraph in enumerate(titled, start=1):
        print(str(number) + ' ' + paragraph.text)
        match = re.search('[A-Za-z_]+', paragraph.text)
        if match is not None:
            name = match.group(0)
        else:
            # No ASCII identifier in the title: fall back to a placeholder.
            name = "当前不符合规则" + str(number)
        print(name)
        names.append(name)
    return names
# Read a single table and return it as a two-dimensional list of strings.
def get_table(doc_tables, index):
    """Return the text of ``doc_tables[index]`` as a list of row lists.

    A cell object that appears again within the same row or within the same
    column is treated as the continuation of a merged cell: only its first
    occurrence keeps its text, every repeat is emitted as ''.
    (Presumably python-docx hands back the same cell object for every grid
    position a merged cell covers -- that repetition is what is detected
    here.)

    Args:
        doc_tables: Indexable collection of table objects exposing ``rows``
            and ``columns``, each item of which exposes ``cells``.
        index: Position of the wanted table inside ``doc_tables``.

    Returns:
        A list of rows, each a list of cell texts.
    """
    table = doc_tables[index]

    # Row pass: keep the first occurrence of each cell, None for repeats.
    grid = []
    for row in table.rows:
        kept = []
        for cell in row.cells:
            kept.append(None if cell in kept else cell)
        grid.append(kept)

    # Column pass: remember every position where a cell repeats downwards.
    repeats = []
    for col_idx, column in enumerate(table.columns):
        seen = []
        for row_idx, cell in enumerate(column.cells):
            if cell in seen:
                repeats.append((row_idx, col_idx))
            else:
                seen.append(cell)

    for row_idx, col_idx in repeats:
        grid[row_idx][col_idx] = None

    return [['' if cell is None else cell.text for cell in row] for row in grid]
# Read every table of the document.
def get_tables_list(doc_tables):
    """Return the get_table() result for each table in ``doc_tables``.

    Args:
        doc_tables: Sized, indexable collection of table objects.

    Returns:
        A list with one two-dimensional text list per table, in order.
    """
    return [get_table(doc_tables, position) for position in range(len(doc_tables))]
# Add one table to the workbook as a new sheet.
# When regenerating the workbook, delete the previous output file by hand first!
def save_sheet(xls_name, sheet_name, table_cells):
    """Write ``table_cells`` into a new sheet ``sheet_name`` of ``xls_name``.

    When ``xls_name`` already exists it is opened with xlrd and copied so
    the new sheet is added to its contents; otherwise a fresh workbook is
    created. The workbook is saved back to ``xls_name`` on every call.

    Args:
        xls_name: Path of the workbook to extend or create.
        sheet_name: Name of the sheet to add.
            NOTE(review): xlwt is expected to reject a sheet name that is
            already present, which would be why a stale output file has to
            be removed before a re-run -- confirm.
        table_cells: Rows of cell values (list of lists). Rows may differ in
            length; each row is written as far as it goes.

    Returns:
        None.
    """
    try:
        book = copy(xlrd.open_workbook(xls_name))
    except FileNotFoundError:
        book = xlwt.Workbook(encoding='utf-8')

    sheet = book.add_sheet(sheet_name)
    # Walk the nested list directly: no array conversion is needed, and this
    # also copes with rows of unequal length.
    for row_idx, row in enumerate(table_cells):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value)  # plain write, no cell style
    book.save(xls_name)
# Save every table of the document, one sheet per table.
def save_all_sheets(xls_name, doc_tables, doc_paragraphs):
    """Write each table in ``doc_tables`` to its own sheet of ``xls_name``.

    Sheet names come from get_tables_names(doc_paragraphs, 2); the i-th name
    is used for the i-th table. If there are more tables than names, the
    mismatch is reported and processing stops (sheets written so far are
    kept). The success message is printed only when every table was saved.

    Args:
        xls_name: Path of the workbook to write.
        doc_tables: Sized, indexable collection of table objects.
        doc_paragraphs: Paragraph objects used to derive the sheet names.

    Returns:
        None. Outcome messages are printed to stdout.
    """
    tables_names = get_tables_names(doc_paragraphs, 2)
    for i in range(len(doc_tables)):
        # Check the count explicitly so that an IndexError raised while
        # reading or writing a table is not misreported as a count mismatch.
        if i >= len(tables_names):
            print("标题与表格数量不匹配")
            return
        if save_sheet(xls_name, tables_names[i], get_table(doc_tables, i)) is False:
            print("保存失败")
            return
    print("保存成功")
def main():
    """Open the Word file named by docFile and export its tables to xlsFile."""
    source = docx.Document(docFile)
    save_all_sheets(xlsFile, source.tables, source.paragraphs)


if __name__ == '__main__':
    main()