import os
# import docx
from docx import Document
from docx.document import Document as DO
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
def get_paragraphs(docx_path):
    """
    Return a one-element list holding the ``text`` attribute of *docx_path*.

    Despite its name, the argument is not used as a file path: the function
    only reads ``docx_path.text``, so it must be an object exposing that
    attribute (presumably a python-docx ``Paragraph`` - confirm with callers).
    """
    return [docx_path.text]
def get_table_text(docx_path):
    """
    Open the Word document at *docx_path* and print the loaded object.

    NOTE(review): despite its name, this function does not extract any table
    text yet. It only prints the ``Document`` instance and implicitly
    returns ``None``.
    """
    # Load the file and echo the resulting Document object.
    print(Document(docx_path))
def iter_block_items(parent):
    """
    Generate the paragraphs and tables directly under *parent*, in document order.

    *parent* is either a main Document object or a table ``_Cell`` (a cell can
    itself hold paragraphs and tables). Every yielded value is a ``Paragraph``
    or a ``Table`` bound to *parent*; child elements of any other kind are
    skipped.

    Raises:
        ValueError: if *parent* is neither a Document nor a ``_Cell``. Because
            this is a generator, the error surfaces on first iteration.
    """
    # Resolve the XML element whose children are the block-level items.
    if isinstance(parent, DO):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        container = parent._tc

    # Pair each supported XML element class with the proxy class wrapping it.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for node in container.iterchildren():
        for element_type, proxy in wrappers:
            if isinstance(node, element_type):
                yield proxy(node, parent)
                break
def get_cell_content(cells):
    """
    Collect the distinct, non-empty cell texts of one table row.

    Texts are returned in order of first appearance. Repeated values are
    dropped: per the original author's note, a row exposes as many cell
    objects as the table's widest row (plus one), so rows with fewer real
    columns report the same text more than once.
    """
    non_empty = (text for text in (cell.text for cell in cells) if text)
    # dict keys keep insertion order, which gives an order-preserving dedup.
    return list(dict.fromkeys(non_empty))
def read_table(table):
    """
    Return the text of *table* as a list of rows, each a list of cell texts.

    Rows follow ``table.rows`` order and cells follow ``row.cells`` order;
    no de-duplication or filtering is applied.
    """
    grid = []
    for row in table.rows:
        texts = []
        for cell in row.cells:
            texts.append(cell.text)
        grid.append(texts)
    return grid
def read_word(word_path):
    """
    Print every block-level item of the Word document at *word_path*.

    Each paragraph or table is first printed as an object. A paragraph is then
    printed as ``text`` followed by a one-element list with its text, and a
    table as ``table`` followed by its rows of cell texts.
    """
    document = Document(word_path)
    for item in iter_block_items(document):
        print(item)
        if isinstance(item, Paragraph):
            print("text", [item.text])
            continue
        if isinstance(item, Table):
            print("table", read_table(item))
if __name__ == "__main__":
path = "."
dir_ma = os.listdir(path)
si = ""
k = 0
for docx in dir_ma:
if docx[-1] != 'x':
continue
doc = Document(docx)
ls = []
LL = []
for block in iter_block_items(doc):
if isinstance(block, Paragraph):
# get_paragraphs(block)
ls.append(block.text)
elif isinstance(block, Table):
for x in read_table(block):
ls = ls + list(set(x))
for z in ls:
if z == '' or z == '\t':
continue
LL.append(z)
for z in LL[12:-1]:
si = si + z + " "
k = k + 1
with open('txt/'+ docx[0:-5]+".txt",'w',encoding='utf-8') as fi:
fi.write(si)