# pip install python-docx
import docx
# Create the document object, i.e. load the Word document.
# NOTE(review): `path` is not defined anywhere in the visible source; it must
# be bound to the .docx file location before this statement runs.
doc = docx.Document(path)

# Print the text of every paragraph.
for para in doc.paragraphs:
    print(para.text)

# Print the index and the text of every paragraph.
for i, para in enumerate(doc.paragraphs):
    print(str(i), para.text)
# Replace a target string inside every paragraph of test.docx, even when the
# string is spread over several runs. The document is only modified in
# memory here; nothing in this snippet saves it.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    # Text of the runs seen so far that has not been written back yet.
    pending = ''
    runs = paragraph.runs
    last_index = len(runs) - 1
    for i, run in enumerate(runs):
        # Merge the run strings so a target split across runs can be found.
        pending += run.text
        if '需要替换的字符串' in pending:
            # Match found: the current run receives the merged text with the
            # target replaced, and accumulation starts over.
            run.text = pending.replace('需要替换的字符串', '我是替换后的字符串')
            pending = ''
        elif i == last_index:
            # The paragraph ended without a (further) match: put the
            # accumulated text into the last run so no text is lost.
            run.text = pending
        else:
            # No match yet: blank this run, its text lives on in `pending`.
            run.text = ''
def _mask_sensitive_text(text):
    """Return *text* with detected sensitive substrings masked, or None.

    Whitespace runs are collapsed to a single space first. Every substring
    reported by ``re_tmp`` or ``name_identify`` (both defined elsewhere in
    the project; presumably they return lists of matched strings -- verify)
    is replaced by the same number of ``'X'`` characters. ``None`` means
    nothing was reported, so the caller can leave the run untouched.
    """
    normalized = re.sub(r"\s+", " ", text)
    sensitive_datas = re_tmp(normalized)
    names = name_identify([normalized])
    if len(sensitive_datas) > 0:
        names = names + sensitive_datas
    if len(names) == 0:
        return None
    for name in names:
        normalized = normalized.replace(name, 'X' * len(name))
    return normalized


def _mask_runs(runs):
    """Mask sensitive text in each run of *runs*, one run at a time."""
    for run in runs:
        masked = _mask_sensitive_text(run.text)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether runs without a match were also rewritten with their
        # whitespace-normalized text. Only matching runs are rewritten here.
        if masked is not None:
            run.text = masked


def docx_inplace_replace(file):
    """Write a copy of a .docx file with sensitive text masked by 'X'.

    Body paragraphs and the paragraphs of every top-level table cell are
    processed run by run, so a sensitive string split across runs is not
    detected. The result is saved next to the input with ``_.docx`` in place
    of the last ``.docx`` in the name, then passed through
    ``remove_header_footer`` (defined elsewhere).

    Args:
        file: Path of the source .docx document.

    Returns:
        Path of the newly written document.
    """
    # rreplace is defined elsewhere; presumably it replaces the last
    # occurrence of '.docx' -- verify against its definition.
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    for paragraph in doc.paragraphs:
        _mask_runs(paragraph.runs)

    # Walk the cells of every table in the document body.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    _mask_runs(paragraph.runs)

    # Save the document.
    doc.save(new_file)
    remove_header_footer(new_file, new_file)
    return new_file
# 2. Unzip the docx and process its XML data
import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce


# 1. Get the XML string.
def getXml(docxFilename):
    """Return the text of ``word/document.xml`` from a docx archive.

    Args:
        docxFilename: Path of the .docx file (a zip archive).

    Returns:
        The main document part decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` entry.
    """
    # The context manager closes the archive and its underlying file handle.
    with zipfile.ZipFile(docxFilename) as archive:
        return archive.read("word/document.xml").decode('utf-8')


# 2. TODO: apply the replacements to the XML string.


# 3. Pack the result back into a docx file.
def createNewDocx(originalDocx, xmlContent, newFilename):
    """Write a copy of a docx whose ``word/document.xml`` is *xmlContent*.

    The original archive is expanded into a temporary directory, the
    modified XML is written over ``word/document.xml`` there, and every
    entry of the original archive is zipped up again, in the original
    order, as *newFilename*.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content (str) for ``word/document.xml``.
        newFilename: Path of the .docx file to create. It may equal
            *originalDocx*, because the source archive is closed before the
            output is opened for writing.
    """
    # TemporaryDirectory removes the directory even if a step below raises.
    with tempfile.TemporaryDirectory() as tmpDir:
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmpDir)
            # Remember every entry of the original docx archive.
            filenames = source.namelist()

        document_path = os.path.join(tmpDir, "word/document.xml")
        with open(document_path, "w", encoding='utf-8') as f:
            f.write(xmlContent)

        # Create the new archive and add all the files to it. Deflate keeps
        # the result close to the size of a docx produced by Word.
        with zipfile.ZipFile(
            newFilename, "w", compression=zipfile.ZIP_DEFLATED
        ) as archive:
            for filename in filenames:
                archive.write(os.path.join(tmpDir, filename), filename)