docx是一个包含文档XML的zip文件.您可以打开zip,阅读文档并使用ElementTree解析数据.
这种技术的优点是你不需要安装任何额外的python库.
import zipfile
import xml.etree.ElementTree
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'
TABLE = WORD_NAMESPACE + 'tbl'
ROW = WORD_NAMESPACE + 'tr'
CELL = WORD_NAMESPACE + 'tc'
with zipfile.ZipFile('') as docx:
tree = xml.etree.ElementTree.XML(docx.read('word/document.xml'))
for table in tree.iter(TABLE):
for row in table.iter(ROW):
for cell in row.iter(CELL):
print ''.join(node.text for node in cell.iter(TEXT))