一、前言
部门接到一个新需求:根据客户提供的 Word 文档,提取其中的相关信息(如下图所示)。
以 # 开头的段落保存在 Excel 文档的第一列;
以日期开头的段落保存在 Excel 文档的第二列。
代码如下:
import re
import xlsxwriter
from docx import Document #除了需要安装docx模块还需安装python-docx模块,否则会报错的
# Script configuration: input Word document, heading log and output workbook.
DOCX_PATH = "C:/Users/SR/IdeaProjects/a/src/main/resources/1.docx"
LOG_PATH = "log.txt"
# NOTE(review): xlsxwriter always produces the XLSX format; the ".xls"
# extension is kept for backward compatibility, but Excel may warn that the
# extension and the file format do not match. Consider renaming to "1.xlsx".
XLSX_PATH = "1.xls"

# Style names of heading paragraphs, e.g. "Heading 1", "Heading 2", ...
HEADING_STYLE = re.compile(r"^Heading \d+$")


def heading_texts(paragraphs, pattern=HEADING_STYLE):
    """Return the text of every paragraph whose style name matches *pattern*.

    Args:
        paragraphs: Iterable of objects exposing ``.text`` and ``.style.name``
            (the script passes ``doc.paragraphs``).
        pattern: Compiled regular expression applied to the style name with
            ``match``.

    Returns:
        A list of paragraph texts, in document order.
    """
    return [p.text for p in paragraphs if pattern.match(p.style.name)]


def split_columns(lines):
    """Split extracted lines into the two spreadsheet columns.

    Lines starting with ``#`` belong to the first column, every other
    non-empty line belongs to the second column. Trailing line terminators
    are removed so they do not end up inside the cells, and empty lines are
    skipped.

    Args:
        lines: Iterable of strings.

    Returns:
        A ``(titles, contents)`` tuple of lists, each in input order.
    """
    titles, contents = [], []
    for raw in lines:
        text = raw.rstrip("\r\n")
        if not text:
            continue
        if text.startswith("#"):
            titles.append(text)
        else:
            contents.append(text)
    return titles, contents


def main(docx_path=DOCX_PATH, xlsx_path=XLSX_PATH, log_path=LOG_PATH):
    """Extract heading paragraphs from a Word document into a workbook.

    Prints the level-3 headings, then prints every heading paragraph and
    appends it to *log_path*, and finally writes the headings to the
    worksheet "test" of *xlsx_path*: texts starting with ``#`` go to column A,
    the remaining ones to column B, below a header row.

    Args:
        docx_path: Path of the Word document to read.
        xlsx_path: Path of the workbook to create.
        log_path: Path of the text file the heading texts are appended to.
    """
    doc = Document(docx_path)  # open the Word document
    paragraphs = doc.paragraphs

    # Extract the level-3 headings and print them.
    for p in paragraphs:
        if p.style.name == "Heading 3":
            print(p.text)

    texts = heading_texts(paragraphs)

    # Open the log once instead of once per heading; the file is still only
    # appended to, as before.
    with open(log_path, mode="a", encoding="utf-8") as log:
        for text in texts:
            print(text, file=log)
            print(text)

    # Build the columns from the headings of this run rather than by reading
    # the append-only log back, so re-running the script does not duplicate
    # rows left over from earlier runs.
    titles, contents = split_columns(texts)

    # The context manager closes (and thereby saves) the workbook even if
    # writing a cell fails.
    with xlsxwriter.Workbook(xlsx_path) as wb:
        ws = wb.add_worksheet("test")
        ws.write("A1", "标题")
        ws.write("B1", "内容")
        # Row 0 holds the header, so data starts at row 1 in both columns.
        for row, title in enumerate(titles, start=1):
            ws.write(row, 0, title)
        for row, content in enumerate(contents, start=1):
            ws.write(row, 1, content)


if __name__ == "__main__":
    main()