[Python]借助Word通配符对文件中的金融数字进行脱敏同时保留原格式

乱七八糟的,先记录一下,以后再整理。

另外还遗留了一个问题:部分较长的数字(超过16位的)对应的通配符表达式过长,Word的查找替换功能无法处理。

# -*- encoding: utf8 -*-


import datetime
import os
import re
from tqdm import tqdm
import datetime
import collections
import numpy as np
from docx import Document
import win32com.client


# Matches a comma-grouped amount with exactly two decimals, e.g. "1,234,567.89":
# 1-3 leading digits, one or more ",ddd" groups, then ".dd".
# Raw strings are used so that "\d" reaches the regex engine untouched instead
# of being parsed as an (invalid) string escape.
# NOTE(review): the class "[,,]" lists the same comma twice; a full-width comma
# may have been intended as the second alternative -- confirm against the
# original script.
CommaNumberPattern = re.compile(r'\d{1,3}([,,]\d\d\d)+([.]\d\d)')
# Matches a single ASCII digit.
NumberPattern = re.compile(r'[0-9]')


if __name__ == "__main__":
    # Mask comma-grouped amounts (e.g. "1,234,567.89") in a .docx file while
    # keeping the document's formatting:
    #   1. read the document with python-docx and record the character length
    #      of every amount found in paragraphs and table cells;
    #   2. build one Word wildcard expression per length, paired with a
    #      same-shaped replacement made of the digit 1 ("1,111,111.11");
    #   3. drive Word through COM to run a wildcard "replace all" per length
    #      and save the result under a new name.

    # Numeric values of the Word enumerations passed to Find.Execute / Close.
    WD_FIND_CONTINUE = 1        # WdFindWrap.wdFindContinue
    WD_REPLACE_ALL = 2          # WdReplace.wdReplaceAll
    WD_DO_NOT_SAVE_CHANGES = 0  # WdSaveOptions.wdDoNotSaveChanges

    document = Document(r'./data/test.docx')

    # Gather the text of every paragraph and every table cell.
    texts = [paragraph.text for paragraph in document.paragraphs]
    for table in document.tables:
        for row in table.rows:
            texts.extend(cell.text for cell in row.cells)

    # Record the length of *every* amount, not just the first one per
    # paragraph/cell; otherwise amounts of other lengths that share a
    # paragraph with an earlier amount would never get a wildcard rule.
    amount_lengths = {
        len(match.group())
        for text in texts
        for match in CommaNumberPattern.finditer(text)
    }
    print(amount_lengths)

    # Longest amounts first, so the rules are applied in descending length.
    num_pattern_dict = collections.OrderedDict()
    for amount_len in sorted(amount_lengths, reverse=True):
        if amount_len == 16:
            # NOTE(review): length 16 is skipped exactly as in the original
            # script. The author's notes mention that wildcards for long
            # numbers are rejected by Word's find/replace; confirm whether
            # this was meant to be ">= 16".
            continue
        # An amount is <head digits> + N * ",ddd" + ".dd", so after removing
        # the 3-character decimal tail every group accounts for 4 characters
        # and the remainder is the number of leading digits.
        head_digits = (amount_len - 3) % 4
        group_count = (amount_len - 3) // 4
        head_pattern = '[0-9]{{{}}}'.format(head_digits)
        body_pattern = '([,,][0-9]{3})' * group_count
        tail_pattern = '[.][0-9]{2}'
        num_pattern = '({}){}{}'.format(head_pattern, body_pattern, tail_pattern)
        encode_pattern = '1' * head_digits + ',111' * group_count + '.11'
        num_pattern_dict[num_pattern] = encode_pattern
    print(num_pattern_dict)

    # NOTE(review): Word opens an absolute path while python-docx above reads
    # './data/test.docx'; they are presumably the same file -- confirm.
    app = win32com.client.DispatchEx('Word.Application')
    doc = None
    try:
        app.Visible = False
        app.DisplayAlerts = False
        app.ScreenUpdating = False
        doc = app.Documents.Open(r'C:\Users\cn190441\PycharmProjects\KPMG_NLP\Num_replace\data\test.docx')
        print("open doc")
        started_at = datetime.datetime.now()
        app.Selection.Find.ClearFormatting()
        app.Selection.Find.Replacement.ClearFormatting()
        for num_pattern, encode_pattern in num_pattern_dict.items():
            app.Selection.Find.Execute(
                num_pattern,       # FindText
                False,             # MatchCase
                False,             # MatchWholeWord
                True,              # MatchWildcards
                False,             # MatchSoundsLike
                False,             # MatchAllWordForms
                True,              # Forward
                WD_FIND_CONTINUE,  # Wrap
                True,              # Format
                encode_pattern,    # ReplaceWith
                WD_REPLACE_ALL,    # Replace
            )
        print('Time Consumption: {}'.format(datetime.datetime.now() - started_at))
        app.ScreenUpdating = True
        doc.SaveAs('C:\\Users\\cn190441\\PycharmProjects\\KPMG_NLP\\Num_replace\\data\\test_rep.docx')
    finally:
        # Always shut Word down, even when a step above fails; otherwise a
        # hidden Word instance stays alive. The document is closed without
        # saving: on success SaveAs has already written the result, and on
        # failure the source file must not be overwritten.
        if doc is not None:
            doc.Close(WD_DO_NOT_SAVE_CHANGES)
        app.Quit()

    # Earlier wildcard experiments, kept for reference:
    #   [0-9]{1,3}([,,][0-9])+([.][0-9][0-9])
    #   ([0-9]{1,3})([,,][0-9][0-9][0-9])@(.[0-9][0-9])
    #   ([,,][0-9]{3}){2}[.][0-9]{2}

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值