今天突然有一个需求，要把统计局网站下载的Word文档里的表格提取出来，放到Excel表中，便于下一步进行数据分析。
1. 引入扩展库
# -*- coding: utf-8 -*-
import docx
from docx import Document
import xlwt;
import xlrd;
import glob
2. 读取Word文档中的表格
def readdoc(filename):
    """Collect the text of every table found in a Word document.

    Args:
        filename: Path of the document, handed straight to ``docx.Document``.

    Returns:
        A list with one entry per table, in the order of ``doc.tables``.
        Each table is a list of rows, and each row is a list holding the
        ``text`` of every cell in that row.
    """
    document = docx.Document(filename)
    return [
        [[cell.text for cell in row.cells] for row in table.rows]
        for table in document.tables
    ]
3. 写入Excel文件
def writeExcel(tables, filename):
    """Write each table to its own sheet of an .xls workbook.

    Sheets are named ``sheet0``, ``sheet1``, ... in the order the tables
    are given. The workbook is saved next to ``filename`` under the same
    name with the extension replaced by ``.xls``.

    Args:
        tables: Iterable of tables; each table is a sequence of rows and
            each row a sequence of cell values (the shape produced by
            ``readdoc``).
        filename: Path of the source document; only used to derive the
            output path.
    """
    # Local import so the block does not rely on a file-level ``import os``.
    import os

    workbook = xlwt.Workbook(encoding='utf-8')
    for sheet_index, table in enumerate(tables):
        worksheet = workbook.add_sheet('sheet' + str(sheet_index),
                                       cell_overwrite_ok=True)
        # Positions come from enumerate(), not list.index(): index() returns
        # the FIRST equal element, so repeated rows or repeated cell values
        # (blank cells, identical numbers) were all written to the first
        # occurrence's coordinates and their real cells were left empty.
        for r, row in enumerate(table):
            for c, cell in enumerate(row):
                print(r, c, cell)
                worksheet.write(r, c, cell)

    # splitext() strips whatever extension is present instead of assuming
    # a five-character ".docx" suffix; for .docx inputs the result is the same.
    root, _ext = os.path.splitext(filename)
    workbook.save(root + ".xls")
4. 遍历目录下所有docx文件，并生成同名Excel文件
# Convert every .docx file directly under jtdoc/: extract its tables and
# write them to a workbook named after the source document.
filenames = glob.glob("jtdoc/*.docx")
for filename in filenames:
    writeExcel(readdoc(filename), filename)