#!/usr/bin/python
# -*- coding: UTF-8 -*-
from docx import Document
import os, sys
from functools import cache
def get_paragraphs(docx_path):
    """Return the text of every paragraph in a Word document.

    Opens the file at ``docx_path`` with ``Document`` and collects the
    ``text`` of each entry in ``document.paragraphs``, in document order.
    Empty paragraphs are kept as empty strings.
    """
    return [para.text for para in Document(docx_path).paragraphs]
def get_table_text(docx_path):
    """Return the cell text of every table in a Word document.

    The result holds one list per table; each of those holds one list per
    row, as produced by ``get_cell_content`` for that row's cells.
    """
    doc = Document(docx_path)
    return [
        [get_cell_content(row.cells) for row in table.rows]
        for table in doc.tables
    ]
def get_cell_content(cells):
    """Return the distinct, non-empty cell texts of one table row.

    Texts are returned in first-seen order. Per the original author's note,
    a row can report more cells than it visibly has (the table's maximum
    column count + 1), which yields repeated values, so duplicates are
    dropped. Because the comparison is on text alone, separate cells that
    happen to hold identical text are also collapsed into one entry.
    """
    texts = (cell.text for cell in cells)
    # dict keys preserve insertion order, giving order-stable de-duplication.
    return list(dict.fromkeys(text for text in texts if text))
def flat_list(lst, resultlt=None) -> list:
    """Flatten an arbitrarily nested list into a single flat list.

    Args:
        lst: A list whose items may themselves be lists, nested to any depth.
        resultlt: Optional list to append the flattened items to. When
            omitted, a fresh list is created for this call.

    Returns:
        The accumulator list (``resultlt`` itself when one was supplied)
        holding every non-list item of ``lst`` in depth-first order.
    """
    # A literal ``[]`` default would be created once and shared by every
    # call, leaking items from earlier calls into later results.
    if resultlt is None:
        resultlt = []
    for item in lst:
        if isinstance(item, list):
            # Recurse with the same accumulator so nested items land in order.
            flat_list(item, resultlt)
        else:
            resultlt.append(item)
    return resultlt
if __name__ == "__main__":
    # Document to dump: the first command-line argument when one is given,
    # otherwise the original hard-coded placeholder value.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else "docx_path"

    # Text outside tables, with empty paragraphs dropped.
    body_texts = [text for text in get_paragraphs(docx_path) if text]
    # Table text comes back nested (tables -> rows -> cells); flatten it.
    table_texts = flat_list(get_table_text(docx_path))

    # Each piece is followed by a single space (including the last one).
    chat = "".join(f"{text} " for text in body_texts + table_texts)
    print(chat)