python实现word内容替换

桂花很香,旭很美

已于 2022-11-01 11:46:42 修改

阅读量5.5k

点赞数 2

分类专栏： Python 文章标签： python

于 2022-10-28 18:50:44 首次发布

本文链接：https://blog.csdn.net/weixin_40959890/article/details/127576951

版权

Python 专栏收录该内容

168 篇文章 3 订阅

订阅专栏

doc文件与docx文件不同

存储方式不同：doc 是二进制存储；docx 是 zip 打包文件（把 docx 文件解压后能看到里面的文件结构，主要由 xml 等文件组成）；
docx易于跨平台，docx更小；
docx对于处理一些复杂对象（比如公式、表格、图片）更得心应手，因为这些对象都可以通过修改对应的xml来配置。

1. python-docx

#pip install python-docx
import docx
# Create the document object for the Word file.
# NOTE(review): ``path`` is not defined in this snippet; it is expected to be
# the path of an existing .docx file supplied by the reader.
doc = docx.Document(path)

# Print the text of every paragraph.
for para in doc.paragraphs:
    print(para.text)

# Print the index and text of every paragraph.
# python-docx rebuilds the paragraph list on each ``doc.paragraphs`` access,
# so iterate it once with enumerate() instead of indexing it inside the loop.
for i, para in enumerate(doc.paragraphs):
    print(i, para.text)

# Replace a target string that may be split across several runs of a paragraph.
#
# Strategy: walk the runs of each paragraph while accumulating their text in
# ``tmp``. Runs that have not completed a match yet are emptied and their text
# is carried forward; as soon as the accumulated text contains the target, the
# current run receives the accumulated text with the target replaced. Text
# carried forward therefore takes on the formatting of the run it lands in.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    tmp = ''
    runs = paragraph.runs
    last_index = len(runs) - 1
    for i, run in enumerate(runs):
        tmp += run.text  # merge the run strings seen so far
        if '需要替换的字符串' in tmp:
            # Match found: put the merged text, with the replacement applied,
            # into the current run and start accumulating afresh.
            run.text = tmp.replace('需要替换的字符串', '我是替换后的字符串')
            tmp = ''
        elif i == last_index:
            # End of paragraph without a (further) match: flush whatever text
            # is still pending into the last run so nothing is lost.
            run.text = tmp
        else:
            # No match yet: empty this run; its text is carried in ``tmp``.
            run.text = ''
        # The flush above is deliberately an ``elif``: when the match completes
        # in the last run, ``tmp`` has just been reset to '' and flushing it
        # unconditionally would erase the text that was just written.


def _mask_sensitive_run(run):
    """Mask sensitive substrings in one run, rewriting it only on a match.

    Whitespace in the run text is collapsed to single spaces before detection.
    Every detected substring is replaced by the same number of ``X``
    characters. Runs without any detection are left untouched, because
    assigning ``run.text`` replaces the whole content of the run.

    NOTE(review): ``re_tmp`` and ``name_identify`` are defined elsewhere;
    presumably both return lists of substrings to mask -- confirm.
    """
    tmp = re.sub(r"\s+", " ", run.text)
    sensitive_datas = re_tmp(tmp)
    names = name_identify([tmp])
    if len(sensitive_datas) > 0:
        names = names + sensitive_datas
    if len(names) > 0:
        for name in names:
            tmp = tmp.replace(name, 'X' * len(name))
        run.text = tmp


def docx_inplace_replace(file):
    """Write a masked copy of a .docx file and return the copy's path.

    Every run in the body paragraphs and in the paragraphs of all top-level
    table cells is passed through the sensitive-data masking. The result is
    saved next to the input with ``_.docx`` appended to the stem, then handed
    to ``remove_header_footer`` (defined elsewhere), which is called with the
    new file as both arguments.

    Args:
        file: Path of the source .docx file.

    Returns:
        The path of the newly written file.
    """
    # ``rreplace`` is a helper defined elsewhere; it is used here to strip the
    # '.docx' suffix (presumably replacing the right-most occurrence).
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    # Body paragraphs.
    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            _mask_sensitive_run(run)

    # Walk the cells of every table.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        _mask_sensitive_run(run)

    # Save the document.
    doc.save(new_file)
    remove_header_footer(new_file, new_file)
    return new_file

2. 解压处理xml 数据

import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce
#1. 获取xml 字符串
def getXml(docxFilename):
    """Return the main document XML of a .docx file as a string.

    Args:
        docxFilename: Path of the .docx (zip) file to read.

    Returns:
        The content of ``word/document.xml`` decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # The context manager closes the archive (and its file handle) even when
    # reading fails.
    with zipfile.ZipFile(docxFilename) as archive:
        xmlString = archive.read("word/document.xml")
    return xmlString.decode('utf-8')
#2.  TODO 对xml 字符串进行替换处理

#3. 封装回docx 文件
def createNewDocx(originalDocx,xmlContent,newFilename):
    """Write a copy of a .docx file with its main document XML replaced.

    The original archive is expanded into a temporary directory,
    ``word/document.xml`` is overwritten with ``xmlContent``, and all members
    of the original archive are zipped up again, in their original order, as
    ``newFilename``. The temporary directory is removed afterwards, even if a
    step fails.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content for ``word/document.xml`` (a ``str``; it is
            written as UTF-8).
        newFilename: Path of the .docx file to create. It may be the same as
            ``originalDocx``, because the source is fully extracted and closed
            before the output is opened.
    """
    with tempfile.TemporaryDirectory() as tmpDir:
        # Expand the original archive and remember its member list.
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmpDir)
            filenames = source.namelist()

        # newline='' writes the XML exactly as given (no platform newline
        # translation).
        document_path = os.path.join(tmpDir, "word/document.xml")
        with open(document_path, "w", encoding='utf-8', newline='') as f:
            f.write(xmlContent)

        # Create the new zip file and add all the original members to it;
        # they are deflated so the result is not stored uncompressed.
        with zipfile.ZipFile(newFilename, "w", zipfile.ZIP_DEFLATED) as target:
            for filename in filenames:
                target.write(os.path.join(tmpDir, filename), filename)