背景:从一个混有文字和多个表格的 Word 文档里,提取表格中的有效信息。
代码:
from docx import Document
import numpy as np
import pandas as pd
# Labels whose following cell holds the value to extract. The tuple order is
# also the column order of the exported sheet.
FIELD_LABELS = ("不动产所有权人", "不动产权属证明", "项目名称", "项目地址")


def _complete_record(partial, labels):
    """Return *partial* as a dict with every label present ("" when missing)."""
    return {label: partial.get(label, "") for label in labels}


def extract_records(cell_texts, labels=FIELD_LABELS):
    """Group label/value cell pairs into one dict per record.

    The cell texts are scanned as one flat stream. A cell whose stripped text
    equals one of *labels* marks the next cell as that label's value. Values
    are stored under the label that introduced them, so a missing or
    reordered field no longer shifts the other columns.

    A record is closed when all labels have been collected, or when a label
    shows up again before the record is complete. Fields that never appeared
    are filled with an empty string.

    Args:
        cell_texts: Iterable of cell text strings in document order.
        labels: Sequence of label strings to look for.

    Returns:
        A list of dicts, each mapping every label to its extracted text.
    """
    label_set = set(labels)
    records = []
    current = {}
    pending = None  # label seen last; the next cell is taken as its value

    for text in cell_texts:
        if pending is None:
            candidate = text.strip()
            if candidate in label_set:
                pending = candidate
            continue

        if text.strip() == pending:
            # python-docx reports a horizontally merged cell once per grid
            # column it spans, so a merged label shows up repeatedly; skip
            # the repeats instead of storing the label as its own value.
            continue

        if pending in current:
            # Same label again before the record was complete: the previous
            # record lacked some fields, so close it and start a new one.
            records.append(_complete_record(current, labels))
            current = {}

        current[pending] = text
        pending = None

        if len(current) == len(label_set):
            records.append(_complete_record(current, labels))
            current = {}

    if current:
        records.append(_complete_record(current, labels))
    return records


def _iter_cell_texts(doc):
    """Yield the text of every cell of every table in *doc*, row by row."""
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                yield cell.text


def main():
    """Read the Word document, extract the labelled fields, export to Excel."""
    # Read the file
    doc = Document("文件名.docx")
    # Collect the labelled values from all tables
    records = extract_records(_iter_cell_texts(doc))
    # Explicit columns keep the header row even when nothing was found
    df = pd.DataFrame(records, columns=list(FIELD_LABELS))
    df.to_excel('rlt.xlsx')


if __name__ == "__main__":
    main()