如果这些文件的模式各不相同,我不确定这段代码是否仍然有效(请注意,我使用的是 Python 2.7.11):
# -*- coding: utf-8 -*-
from docx import Document
import sys
import os
import re
# Python 2-only workaround (the builtin `reload` does not exist in Python 3).
# `sys.setdefaultencoding` is removed from the `sys` namespace at interpreter
# startup, so `sys` is reloaded first to make the function callable again.
# The order of these two statements therefore matters.
reload(sys)
# Switch the process-wide default string encoding to UTF-8.
# NOTE(review): presumably needed because the table cell text read below is
# non-ASCII -- confirm; this hack affects every implicit str/unicode
# conversion in the process.
sys.setdefaultencoding('utf8')
for root, dirs, files in os.walk("."):
for name in files:
doc_file = os.path.join(root, name)
if doc_file.endswith('docx'):
main_file = Document(doc_file)
table = main_file.tables[1] # this is same for every document
data = []
keys = None
for i, row in enumerate(table.rows):
text = (cell.text for cell in row.cells)
if i == 0:
keys = tuple(text)
continue
row_data = tuple(text)
data.append(row_data)
regexReference = re.compile("(C.-[0-9-]+)")
regexCoordinate = re.compile(r'(N-(