import os
import re
from datetime import datetime
from tqdm import trange
import pandas as pd
path_sql_file = "/u/users/vn55gxn/py_file/python_source_file_sql"
path_target_file = "/u/users/vn55gxn/py_file/python_target_file"
path_excle_file = "/u/users/vn55gxn/py_file/python_source_file_excle/python_repace.xlsx"
def repalce_file(path_sql_file, path_target_file, var_dict_mapping,table_dict_mapping): #type0,默认,为参数替换,1,表替换
#来源表读取部分
pattern_from = r'from\s+(.*?)\s+' #匹配from 后面两个空格之间的内容
pattern_join = r'join\s+(.*?)\s+'
# pattern_set_par = r'set(?!.*hive\.exec\.dynamic\.partition)' #包含set 但是不是动态分区的参数
pattern_set_par = r'set\s(?!.*hive\.exec\.dynamic\.partition)' #包含set 但是不是动态分区的参数
pattern_tmp = re.compile(r'^(?!.*(?:tmp|temp)).*$') # 过滤了tmp|temp 这些表,匹配不含有tmp的
# pattern_set = re.compile(r'^(?!.*(?:set |SET )).*$') #过滤set参数
match_from_list =[]
with open(path_sql_file, "r") as file_from_source:
from_lines = file_from_source.readlines()
file_from_source.close()
for line in from_lines:
match_from = re.search(pattern_from,line)
if match_from:
if len(match_from.group(1))>10 and re.findall(pattern_tmp,match_from.group(1)): #过滤一些不正确的匹配
match_from_list.append(match_from.group(1))
# print(match_from.group(1))
match_join = re.search(pattern_join,line)
if match_join:
if len(match_join.group(1))>10 and re.findall(pattern_tmp,match_join.group(1)): #过滤一些不正确的匹配
match_from_list.append(match_join.group(1))
# print(match_join.group(1))
match_from_set=set(match_from_list) #set去重
match_from_news=['--' + s for s in match_from_set] #set拼接--
# print(match_from_news)
from_last_1=[] #存放hive的来源表,拼接固定100长度
from_last_2=[] #存放云上的来源表
from_last_11=[] #存放hive的来源表
for match_from_new in match_from_news:
from_last_1.append(match_from_new.ljust(100)) #拼接够100个字符
from_last_11.append(match_from_new) #保留原始的数据,用作后面比较 1
# print(from_last_1)
for key, value in table_dict_mapping.items():
pattern = re.compile(r"\b"+ re.escape(key)+ r"\b") # 生成正则对象,\b,全词匹配
match_from_new = pattern.sub(str(value), match_from_new) # sub正则替换
# print(match_from_new)
from_last_2.append(match_from_new)
# print('***********')
# print(from_last)
new_from_last =[a + b +"\n" for a,b in zip(from_last_1,from_last_2)]
# print(from_last_11)
# print(from_last_2)
from_last_3 = list(set(from_last_11) & set(from_last_2)) #打印没有替换成功的表
if len(from_last_3) >0:
print(path_target_file.ljust(100) + "没有匹配成功的表:")
for i in from_last_3:
print(" ".ljust(20)+i)
# print("没有替换成功的表:"+i)
else:
print(path_target_file.ljust(100) + "表全部匹配成功的表:")
with open(path_sql_file, "r") as file_source:
lines = file_source.readlines()
file_source.close()
with open(path_target_file, "w") as file_from_target: # 不存在会自动创建
file_from_target.write('') # 写入前先清空目标文件
for i in new_from_last:
# print(i + "\n") # 写入来源表
file_from_target.write(i)
for line in lines:
# if "," not in line: #优化:带","的字段不匹配
for key, value in var_dict_mapping.items():
pattern = re.compile(re.escape(key)) # 参数列表生成正则对象
line = pattern.sub(str(value), line) # sub正则替换
for key, value in table_dict_mapping.items():
pattern = re.compile(r"\b"+ re.escape(key)+ r"\b") # 参数列表生成正则对象,\b, 全词匹配
line = pattern.sub(str(value), line) # sub正则替换
# print(line)
# print(table_dict_mapping)
match_set=re.search(pattern_set_par,line)
if match_set:
pass
else:
# print(line)
file_from_target.write(line)
file_from_target.close()
if __name__ == '__main__':
# Python 字典(Dictionary) items() 函数以列表返回可遍历的(键, 值) 元组数组。
# 读取excle 的数据映射成dict
var_file_dataframe = pd.read_excel(path_excle_file,sheet_name='参数替换')
var_file_dataframe =var_file_dataframe.fillna("") # 处理excel表里面空白读取出来的nan,转换成空字符串""
# dataframe 去重保留前面的第一行,优先取商品域上云的表
var_dataframe_distinct = var_file_dataframe.drop_duplicates(subset = ['source'], keep ='first')
var_dict_mapping = var_dataframe_distinct.set_index('source')['target'].to_dict()
table_file_dataframe = pd.read_excel(path_excle_file,sheet_name= '表替换')
table_file_dataframe =table_file_dataframe.fillna("")
# dataframe 去重保留前面的第一行,优先取商品域上云的表
table_dataframe_distinct = table_file_dataframe.drop_duplicates(subset = ['source'], keep ='first')
table_dict_mapping = table_dataframe_distinct.set_index('source')['target'].to_dict()
# print(dict_mapping)
# print(table_dict_mapping)
list_dir = os.listdir(path_sql_file)
for i in list_dir:
path_sql_file_dtl = path_sql_file + r"/" + i
path_target_file_dtl = path_target_file + r"/" + i
#替换开始
strat = datetime.now()
repalce_file(path_sql_file_dtl,path_target_file_dtl,var_dict_mapping,table_dict_mapping) #参数替换,必须为第一个不能调换顺序,因为会先清空表
edn = datetime.now()
print(path_target_file_dtl.ljust(100) + " 已完成 用时:" + format((edn-strat).seconds) +"s")
print("全部已完成。。。。。。")
上云替换脚本
最新推荐文章于 2024-06-14 09:50:01 发布