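# Rough working notes, recorded as-is for now; to be cleaned up later.
# Known open issue: for some numbers (more than 16 digits long) the generated wildcard pattern is too long, and Word does not support search-and-replace with it.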
# -*- encoding: utf8 -*-
import datetime
import os
import re
from tqdm import tqdm
import datetime
import collections
import numpy as np
from docx import Document
import win32com.client
# Matches a thousands-separated amount with exactly two decimals, e.g.
# "1,234.56" or "12,345,678.90": 1-3 leading digits, one or more
# ",ddd" groups, then ".dd".  Raw string so that "\d" is a regex escape
# rather than an (invalid) Python string escape.
# NOTE(review): the separator class lists the comma twice; presumably one of
# them was meant to be the full-width comma (U+FF0C) -- confirm.
CommaNumberPattern = re.compile(r'\d{1,3}([,,]\d\d\d)+([.]\d\d)')
# Matches a single ASCII digit.
NumberPattern = re.compile(r'[0-9]')
def iter_document_texts(document):
    """Yield the text of every paragraph, then of every table cell.

    ``document`` must expose ``paragraphs`` and ``tables`` (each table with
    ``rows``, each row with ``cells``), where paragraphs and cells carry a
    ``text`` attribute -- the attributes this script reads from the object
    returned by ``Document(...)``.
    """
    for paragraph in document.paragraphs:
        yield paragraph.text
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                yield cell.text


def collect_comma_numbers(texts, pattern):
    """Return every match of *pattern* found in the strings of *texts*.

    All matches in each string are collected, not just the first one, so a
    paragraph or cell holding several numbers contributes all of them.
    """
    return [match.group() for text in texts for match in pattern.finditer(text)]


def build_wildcard_patterns(lengths, skip_lengths=(16,)):
    """Map a Word wildcard pattern to its masking text for each number length.

    A number of total length ``n`` is ``h`` leading digits, ``k`` groups of
    ",ddd" and a ".dd" tail, so ``n - 3 == h + 4 * k``.  For each length the
    key is the wildcard expression matching numbers of exactly that shape and
    the value is the same shape with every digit replaced by ``1``
    (e.g. ``"1,111.11"`` for length 8).

    Patterns are ordered from the longest length to the shortest.
    NOTE(review): presumably so that a shorter pattern does not first match
    the tail of a longer number -- confirm.

    ``skip_lengths`` lists lengths for which no pattern is produced.  The
    default keeps the original script's behavior of skipping length 16.
    NOTE(review): the note at the top of this file mentions wildcard patterns
    that are too long for Word; presumably that is why 16 is skipped --
    confirm.
    """
    patterns = collections.OrderedDict()
    for length in sorted(set(lengths), reverse=True):
        if length in skip_lengths:
            continue
        head_digits = (length - 3) % 4
        group_count = (length - 3) // 4
        find_text = '({}){}{}'.format(
            '[0-9]{{{}}}'.format(head_digits),
            '([,,][0-9]{3})' * group_count,
            '[.][0-9]{2}',
        )
        mask_text = '1' * head_digits + ',111' * group_count + '.11'
        patterns[find_text] = mask_text
    return patterns


def mask_numbers_in_word(input_path, output_path, pattern_map):
    """Run each wildcard replacement of *pattern_map* in Word and save a copy.

    Opens *input_path* in a hidden Word instance, executes one
    replace-all wildcard search per ``pattern -> replacement`` entry, and
    saves the result as *output_path*.  The document is always closed and
    Word is always quit, even when a step raises, so no hidden Word process
    is left behind.
    """
    app = win32com.client.DispatchEx('Word.Application')
    try:
        app.Visible = False
        app.DisplayAlerts = False
        app.ScreenUpdating = False
        doc = app.Documents.Open(input_path)
        try:
            print("open doc")
            started = datetime.datetime.now()
            app.Selection.Find.ClearFormatting()
            app.Selection.Find.Replacement.ClearFormatting()
            for num_pattern, encode_pattern in pattern_map.items():
                # Positional arguments of Find.Execute: FindText, MatchCase,
                # MatchWholeWord, MatchWildcards, MatchSoundsLike,
                # MatchAllWordForms, Forward, Wrap (1 = wdFindContinue),
                # Format, ReplaceWith, Replace (2 = wdReplaceAll).
                app.Selection.Find.Execute(
                    num_pattern, False, False, True, False, False,
                    True, 1, True, encode_pattern, 2)
            # An earlier approach (kept in the original as commented-out
            # code) ran one non-wildcard Find.Execute per literal number.
            print('Time Consumption: {}'.format(datetime.datetime.now() - started))
            app.ScreenUpdating = True
            doc.SaveAs(output_path)
        finally:
            # 0 = wdDoNotSaveChanges: after SaveAs there is nothing left to
            # save, and after a failure the partially edited document must
            # not be written back over the input file.
            doc.Close(0)
    finally:
        app.Quit()


def main():
    """Mask every comma-formatted number in the test document via Word."""
    # NOTE(review): python-docx reads a path relative to the working
    # directory while Word opens a hard-coded absolute path; they are
    # presumably the same file -- confirm.
    document = Document(r'./data/test.docx')
    rep_list = collect_comma_numbers(iter_document_texts(document), CommaNumberPattern)
    lengths = {len(rep_num) for rep_num in rep_list}
    print(lengths)
    num_pattern_dict = build_wildcard_patterns(lengths)
    print(num_pattern_dict)
    mask_numbers_in_word(
        r'C:\Users\cn190441\PycharmProjects\KPMG_NLP\Num_replace\data\test.docx',
        'C:\\Users\\cn190441\\PycharmProjects\\KPMG_NLP\\Num_replace\\data\\test_rep.docx',
        num_pattern_dict,
    )


if __name__ == "__main__":
    main()
# Wildcard attempts: ([0-9]{1,3})([,,][0-9][0-9][0-9])@(.[0-9][0-9]) ([,,][0-9]{3}){2}[.][0-9]{2}