遇见这个一个需求:几百万的数据,需要替换掉不同种类的敏感语句,需要在测试库中进行替换,成功后再替换生产库。
难点1:考虑即适用开发环境,又适用生产环境,考虑使用python(简洁,方便)
难点2:数据量太大;解决办法:查询时使用elasticsearch
难点3:替换种类多;解决办法:进行一定的封装
写的不算细致,可以替换部分规则的敏感语句。代码显示没有格式,需要自行 Reformat Code
具体代码如下:
# -*- coding: UTF-8 -*-
from elasticsearch import Elasticsearch
import pymysql
import re
import time
# Elasticsearch查找query匹配的结果
def getElasticsearchList(query_string=''):
es = Elasticsearch(
[
'http://192.168.1.166:9200',
]
)
res = es.search(index="test-questions", body={"query": {
"bool": {"must": [{"query_string": {"default_field": "test-question.content", "query": query_string}}],
"must_not": [],
"should": []}}, "from": 0, "size": 2})
print("Got %d Hits:" % res['hits']['total'])
# 查找匹配的字符 ,并以字典形式{id:content}返回
dict = {}
for hit in res['hits']['hits']:
dict[hit["_source"]['id']] = hit["_source"]['content']
return dict
#替换符合规则的字符
def deleteString(dict):
result = {}
for key in dict:
reg = r'alt=".*!"' #替换'alt'标签
strre = re.compile(reg)
str = re.sub(strre, ' ', dict[key])
result[key] = str
return result
#更新操作
conn = pymysql.connect(host='192.168.1.225', user='admin', passwd='neptune@admin', db='teachresource', port=3306,
charset='utf8')
cur = conn.cursor()
resultList = deleteString(dict)
for key in resultList:
cmd = "update sh_testquestion_bak set updateTime=%s,content =%s where id=%s"
cur.execute(
cmd, (int(time.time()) * 1000, resultList[key], key))
conn.commit()
conn.close()