from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph
import docx
# Location of the Word (.docx) file to read; hard-coded, relative to the
# current working directory.
path = 'LL-A190311.docx'
# Open the file with python-docx. The resulting document object is what the
# rest of the script walks through.
doc = docx.Document(path)
def iter_block_items(parent):
    """Yield the paragraphs and tables that are direct children of *parent*.

    Items are produced in the order ``iterchildren()`` returns them, so
    paragraphs and tables come out interleaved rather than grouped by type.
    Children that are neither paragraph nor table elements are skipped.

    Args:
        parent: A ``_Document``, ``_Cell`` or ``_Row`` instance.

    Yields:
        ``Paragraph`` or ``Table`` wrappers built around each matching child
        element, with *parent* passed as their parent.

    Raises:
        ValueError: If *parent* is none of the supported types. Because this
            is a generator, the error surfaces when iteration starts, not
            when the function is called.
    """
    # (wrapper type, how to reach the XML element whose children we walk).
    # Checked in this order; the first matching type wins.
    container_lookups = (
        (_Document, lambda owner: owner.element.body),
        (_Cell, lambda owner: owner._tc),
        # NOTE(review): the direct children of a row element are presumably
        # cell elements rather than paragraphs/tables, in which case this
        # branch never yields anything -- confirm before relying on it.
        (_Row, lambda owner: owner._tr),
    )
    for wrapper_type, locate in container_lookups:
        if isinstance(parent, wrapper_type):
            container = locate(parent)
            break
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
# Accumulates the extracted document content: the traversal loop below
# appends one string per paragraph and one list of cell texts per table row.
ls = []
def get_cell_content(cells):
    """Return the non-empty text of each distinct cell in one table row.

    The sequence of cells handed in for a row can contain the same cell more
    than once (the original author observed this for rows with fewer columns
    than the table's widest row; presumably a merged cell is reported once
    per grid column it spans -- confirm against python-docx). Such repeats
    are dropped.

    Repeats are detected by the identity of the cell's underlying ``_tc``
    element, not by comparing text, so two *different* cells that happen to
    contain the same text are both kept.

    Args:
        cells: Iterable of cell objects exposing a ``text`` attribute and,
            normally, a ``_tc`` attribute. Objects without ``_tc`` are
            de-duplicated by their own identity.

    Returns:
        A list of strings in the order the cells were first encountered.
        Cells whose text is empty are skipped.
    """
    row_content = []
    # Maps id(element) -> element. Storing the element itself keeps it alive
    # for the duration of the call, so an id cannot be recycled for another
    # object while we are still comparing against it.
    seen_elements = {}
    for cell in cells:  # walk every cell reported for this row
        element = getattr(cell, "_tc", cell)
        if id(element) in seen_elements:
            continue  # same physical cell reported again
        seen_elements[id(element)] = element
        text = cell.text  # read once
        if text:
            row_content.append(text)
    return row_content
# Walk the document's top-level blocks in order and flatten them into `ls`:
# each table contributes one list of cell texts per row, each paragraph
# contributes its text as a plain string.
for block in iter_block_items(doc):
    if isinstance(block, Table):
        for x in block.rows:
            ls.append(get_cell_content(x.cells))
        continue
    if isinstance(block, Paragraph):
        ls.append(block.text)

# Dump everything that was collected.
print(ls)