# 真·顺序读取docx文本

from docx.document import Document as _Document

from docx.oxml.text.paragraph import CT_P

from docx.oxml.table import CT_Tbl

from docx.table import _Cell, Table, _Row

from docx.text.paragraph import Paragraph

import docx

# Path of the .docx file this script reads.
path = 'LL-A190311.docx'

# Open the file with python-docx; `doc` is the object the rest of the script iterates.
doc = docx.Document(path)

def iter_block_items(parent):
    """Yield the paragraphs and tables found directly under *parent*, in order.

    *parent* must be a ``_Document``, ``_Cell`` or ``_Row``. The children of
    its underlying element are visited one by one: a ``CT_P`` child is
    wrapped in ``Paragraph``, a ``CT_Tbl`` child in ``Table``, and any other
    child is skipped. Each wrapper receives *parent* as its parent object.

    Raises:
        ValueError: if *parent* is none of the supported types. This is a
            generator, so the error is raised when iteration starts, not
            when the function is called.

    NOTE(review): for a ``_Row`` the children walked are those of ``_tr``;
    presumably these are cell elements rather than paragraphs/tables, in
    which case nothing is yielded for rows -- confirm before relying on it.
    """
    if not isinstance(parent, (_Document, _Cell, _Row)):
        raise ValueError("something's not right")

    # Pick the element whose direct children will be walked.
    if isinstance(parent, _Document):
        container = parent.element.body
    else:
        container = parent._tc if isinstance(parent, _Cell) else parent._tr

    # Element class -> proxy class used to wrap a matching child.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))

    for node in container.iterchildren():
        for element_type, proxy in wrappers:
            if isinstance(node, element_type):
                yield proxy(node, parent)
                break



 

# Collected content in document order: a str for each paragraph and a list
# of cell texts for each table row.
ls = []

def get_cell_content(cells):
    """Return the non-empty texts of one table row, one entry per distinct cell.

    A row's cell sequence can contain the same cell more than once (the
    original note says rows with fewer columns show repeated values), so
    repeats have to be collapsed. Repeats are detected by the identity of the
    cell's underlying ``_tc`` element (falling back to the cell object itself
    when there is no ``_tc`` attribute) instead of by comparing text, so two
    *different* cells that happen to hold the same text are both kept.

    NOTE(review): this presumes python-docx hands back the same ``_tc``
    element for every grid column a merged cell spans -- verify against the
    installed python-docx version.

    Args:
        cells: iterable of cell objects exposing a ``text`` attribute.

    Returns:
        list of str: texts in row order; cells with empty text are skipped.
    """
    row_content = []
    # id(element) -> element. Holding the reference keeps the object alive so
    # its id cannot be reused by another object while this row is processed.
    seen = {}

    for cell in cells:
        text = cell.text
        if not text:
            continue

        element = getattr(cell, "_tc", cell)
        if id(element) in seen:
            # Same underlying cell reported again (merged cell repeat).
            continue

        seen[id(element)] = element
        row_content.append(text)

    return row_content

# Walk the document in order, collecting paragraph text and table rows into `ls`.
for block in iter_block_items(doc):
    if isinstance(block, Table):
        # Each table row contributes one list of cell texts.
        ls.extend(get_cell_content(row.cells) for row in block.rows)
    elif isinstance(block, Paragraph):
        # A paragraph contributes its text as a single string.
        ls.append(block.text)

print(ls)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值