python实现word内容替换

doc文件与docx文件不同

  • 存储方式不同:doc 是二进制格式;docx 是 ZIP 打包文件(可以解压查看内部的文件结构,主要由 XML 等文件组成);
  • docx易于跨平台,docx更小;
  • docx 处理公式、表格、图片等复杂对象更得心应手,因为这些对象都可以通过修改其中的 XML 来操作。

1. python-docx

#pip install python-docx
import docx
# Open the Word document. `path` must already hold the location of the
# .docx file; it is not defined in this snippet.
doc = docx.Document(path)

# Read the paragraph list once and reuse it: python-docx appears to build a
# fresh list on every `doc.paragraphs` access, so indexing it inside a loop
# makes the loop quadratic.
paragraphs = doc.paragraphs

# Print the text of every paragraph.
for para in paragraphs:
    print(para.text)

# Print every paragraph's index followed by its text.
for i, para in enumerate(paragraphs):
    print(i, para.text)
# Replace a target string in every body paragraph of 'test.docx', even when
# the string is split across several runs.  The document is only modified in
# memory here; this snippet does not save it.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    # Text of the runs seen since the last match (or since the paragraph
    # start) that has not been written back to any run yet.
    pending = ''
    runs = paragraph.runs
    last_index = len(runs) - 1
    for i, run in enumerate(runs):
        pending += run.text  # merge this run's text into the buffer
        if '需要替换的字符串' in pending:
            # Match found: the current run receives the whole merged text
            # with the replacement applied, and the buffer starts over.
            run.text = pending.replace('需要替换的字符串', '我是替换后的字符串')
            pending = ''
        elif i == last_index:
            # Last run and no match in the buffer: flush the buffered text
            # here so nothing is lost.  This must not run after a match on
            # the last run, otherwise the now-empty buffer would erase the
            # text that was just replaced.
            run.text = pending
        else:
            # No match yet: this run's text now lives in the buffer, so the
            # run itself is emptied to avoid duplicating it later.
            run.text = ''

def _mask_sensitive_runs(paragraphs):
    """Overwrite detected sensitive substrings in each run with 'X' characters."""
    for paragraph in paragraphs:
        for run in paragraph.runs:
            # Detection works on whitespace-normalised text.
            text = re.sub(r"\s+", " ", run.text)
            sensitive_datas = re_tmp(text)
            names = name_identify([text])
            if len(sensitive_datas) > 0:
                names = names + sensitive_datas
            if len(names) > 0:
                for name in names:
                    # Same-length mask keeps the text layout roughly intact.
                    text = text.replace(name, 'X' * len(name))
                # Only touch runs that actually contained a hit: assigning
                # run.text replaces the run's whole content, so writing back
                # unconditionally would needlessly rewrite clean runs.
                run.text = text


def docx_inplace_replace(file):
    """Write a masked copy of a .docx file and return the copy's path.

    Every run in the body paragraphs and in the paragraphs of every table
    cell is scanned; each string reported by ``re_tmp`` or ``name_identify``
    is replaced by a run of 'X' of the same length.  Runs without any hit
    are left untouched.

    The helpers ``rreplace``, ``re_tmp``, ``name_identify`` and
    ``remove_header_footer`` are defined elsewhere.
    NOTE(review): ``rreplace`` presumably strips the last '.docx' from the
    name, and ``remove_header_footer`` presumably strips headers/footers in
    place -- confirm against their definitions.

    Args:
        file: Path of the source .docx document.

    Returns:
        Path of the new document: ``rreplace(file, '.docx', '', 1)`` with
        ``'_.docx'`` appended.
    """
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    # Body paragraphs first, then the paragraphs of every table cell.
    _mask_sensitive_runs(doc.paragraphs)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _mask_sensitive_runs(cell.paragraphs)

    # Save the masked copy, then post-process that same file.
    doc.save(new_file)
    remove_header_footer(new_file, new_file)
    return new_file

2. 解压处理xml 数据

import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce
#1. 获取xml 字符串
def getXml(docxFilename):
    """Return the main document XML of a .docx file as a string.

    Args:
        docxFilename: Path of the .docx (zip) file to read.

    Returns:
        The content of ``word/document.xml`` decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # The context manager closes the archive (and its file handle) even if
    # reading or decoding fails.
    with zipfile.ZipFile(docxFilename) as archive:
        return archive.read("word/document.xml").decode('utf-8')
#2.  TODO 对xml 字符串进行替换处理

#3. 封装回docx 文件
def createNewDocx(originalDocx, xmlContent, newFilename):
    """Build a new .docx from an existing one with a replaced document XML.

    The original archive is expanded into a temporary directory,
    ``word/document.xml`` is overwritten with ``xmlContent``, and every
    member listed in the original archive is zipped into ``newFilename``.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content for ``word/document.xml`` (text, written as
            UTF-8).
        newFilename: Path of the .docx file to create.
    """
    # TemporaryDirectory removes the scratch directory even when a step
    # below raises.
    with tempfile.TemporaryDirectory() as tmpDir:
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmpDir)
            # Member names of the original archive, in their original order.
            filenames = source.namelist()

        # newline='' writes the XML exactly as given (no platform newline
        # translation).
        document_path = os.path.join(tmpDir, "word/document.xml")
        with open(document_path, "w", encoding='utf-8', newline='') as f:
            f.write(xmlContent)

        # Re-pack with deflate compression; the zipfile default would store
        # every member uncompressed.
        with zipfile.ZipFile(newFilename, "w", zipfile.ZIP_DEFLATED) as target:
            for filename in filenames:
                target.write(os.path.join(tmpDir, filename), filename)
  • 2
    点赞
  • 24
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值