我试图使用下面的代码从.docx获取文本,但问题是文本包含特殊字符(例如“ç”或“á”),并且代码没有正确地读取文件。在try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import zipfile
"""
Module that extract text from MS XML Word document (.docx).
(Inspired by python-docx )
"""
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'
def get_docx_text(path):
"""
Take the path of a docx file as argument, return the text in unicode.
"""
document = zipfile.ZipFile(path)
xml_content = document.read('word/document.xml')
document.close()
tree = XML(xml_content)
paragraphs = []