上云替换脚本

最新推荐文章于 2024-07-12 16:16:27 发布

沈炎

最新推荐文章于 2024-07-12 16:16:27 发布

阅读量81

点赞数 3

文章标签： python

本文链接：https://blog.csdn.net/shen13414959064/article/details/136388974

版权

该篇文章介绍了如何使用Python脚本对SQL文件进行操作，包括从Excel文件读取参数和表名映射，然后在SQL语句中进行替换，以实现从本地到云端的表名变更和参数更新。

摘要由CSDN通过智能技术生成

import os
import re
from datetime import datetime
from tqdm import trange
import pandas as pd

path_sql_file      = "/u/users/vn55gxn/py_file/python_source_file_sql"
path_target_file   = "/u/users/vn55gxn/py_file/python_target_file"
path_excle_file    = "/u/users/vn55gxn/py_file/python_source_file_excle/python_repace.xlsx"


def repalce_file(path_sql_file, path_target_file, var_dict_mapping,table_dict_mapping):  #type0,默认，为参数替换，1，表替换

    #来源表读取部分
    pattern_from = r'from\s+(.*?)\s+'                                                                   #匹配from 后面两个空格之间的内容
    pattern_join = r'join\s+(.*?)\s+'
    # pattern_set_par = r'set(?!.*hive\.exec\.dynamic\.partition)'                                        #包含set 但是不是动态分区的参数
    pattern_set_par = r'set\s(?!.*hive\.exec\.dynamic\.partition)'                                        #包含set 但是不是动态分区的参数
    pattern_tmp = re.compile(r'^(?!.*(?:tmp|temp)).*$')                                                 # 过滤了tmp|temp 这些表,匹配不含有tmp的
    # pattern_set = re.compile(r'^(?!.*(?:set |SET )).*$')                                               #过滤set参数


    match_from_list =[]
    with  open(path_sql_file, "r") as file_from_source:
        from_lines = file_from_source.readlines()
        file_from_source.close()


        for line in from_lines:
            match_from = re.search(pattern_from,line)
            if match_from:
                if len(match_from.group(1))>10 and re.findall(pattern_tmp,match_from.group(1)):              #过滤一些不正确的匹配
                    match_from_list.append(match_from.group(1))
                    # print(match_from.group(1))

            match_join = re.search(pattern_join,line)
            if match_join:
                if len(match_join.group(1))>10 and re.findall(pattern_tmp,match_join.group(1)):              #过滤一些不正确的匹配
                    match_from_list.append(match_join.group(1))
                    # print(match_join.group(1))


    match_from_set=set(match_from_list)                                                                     #set去重
    match_from_news=['--' + s for s in match_from_set]                                                      #set拼接--
    # print(match_from_news)
    from_last_1=[]   #存放hive的来源表，拼接固定100长度
    from_last_2=[]   #存放云上的来源表
    from_last_11=[]    #存放hive的来源表
    for match_from_new in match_from_news:
        from_last_1.append(match_from_new.ljust(100))   #拼接够100个字符
        from_last_11.append(match_from_new)             #保留原始的数据，用作后面比较 1
        # print(from_last_1)
        for key, value in table_dict_mapping.items():
            pattern = re.compile(r"\b"+ re.escape(key)+ r"\b")          # 生成正则对象,\b,全词匹配
            match_from_new = pattern.sub(str(value), match_from_new)    # sub正则替换
        # print(match_from_new)
        from_last_2.append(match_from_new)
        # print('***********')
    # print(from_last)
    new_from_last =[a  + b +"\n" for a,b in zip(from_last_1,from_last_2)]
    # print(from_last_11)
    # print(from_last_2)

    from_last_3 = list(set(from_last_11) & set(from_last_2))            #打印没有替换成功的表

    if len(from_last_3) >0:
        print(path_target_file.ljust(100) + "没有匹配成功的表:")
        for i in from_last_3:
            print(" ".ljust(20)+i)
            # print("没有替换成功的表:"+i)
    else:
        print(path_target_file.ljust(100) + "表全部匹配成功的表:")

    with  open(path_sql_file, "r") as file_source:
        lines = file_source.readlines()
        file_source.close()

    with open(path_target_file, "w") as file_from_target:  # 不存在会自动创建
        file_from_target.write('')  # 写入前先清空目标文件

        for i in new_from_last:
            # print(i + "\n")  # 写入来源表
            file_from_target.write(i)

        for line in lines:
            # if "," not in line:                            #优化：带","的字段不匹配
            for key, value in var_dict_mapping.items():
                pattern = re.compile(re.escape(key))    # 参数列表生成正则对象
                line = pattern.sub(str(value), line)  # sub正则替换

            for key, value in table_dict_mapping.items():
                pattern = re.compile(r"\b"+ re.escape(key)+ r"\b")    # 参数列表生成正则对象,\b, 全词匹配
                line = pattern.sub(str(value), line)  # sub正则替换
            # print(line)
            # print(table_dict_mapping)
            match_set=re.search(pattern_set_par,line)
            if match_set:
                pass
            else:
                # print(line)
                file_from_target.write(line)
        file_from_target.close()


if __name__ == '__main__':

    # Python 字典(Dictionary) items() 函数以列表返回可遍历的(键, 值) 元组数组。
    # 读取excle 的数据映射成dict
    var_file_dataframe = pd.read_excel(path_excle_file,sheet_name='参数替换')
    var_file_dataframe =var_file_dataframe.fillna("")  # 处理excel表里面空白读取出来的nan，转换成空字符串""
    # dataframe 去重保留前面的第一行，优先取商品域上云的表
    var_dataframe_distinct = var_file_dataframe.drop_duplicates(subset = ['source'], keep ='first')
    var_dict_mapping = var_dataframe_distinct.set_index('source')['target'].to_dict()

    table_file_dataframe = pd.read_excel(path_excle_file,sheet_name= '表替换')
    table_file_dataframe =table_file_dataframe.fillna("")
    # dataframe 去重保留前面的第一行，优先取商品域上云的表
    table_dataframe_distinct = table_file_dataframe.drop_duplicates(subset = ['source'], keep ='first')
    table_dict_mapping = table_dataframe_distinct.set_index('source')['target'].to_dict()

    # print(dict_mapping)
    # print(table_dict_mapping)
    list_dir = os.listdir(path_sql_file)


    for i in list_dir:
        path_sql_file_dtl = path_sql_file + r"/" + i
        path_target_file_dtl = path_target_file + r"/" + i
        #替换开始
        strat = datetime.now()
        repalce_file(path_sql_file_dtl,path_target_file_dtl,var_dict_mapping,table_dict_mapping)  #参数替换，必须为第一个不能调换顺序，因为会先清空表
        edn = datetime.now()
        print(path_target_file_dtl.ljust(100) + " 已完成 用时：" + format((edn-strat).seconds) +"s")
    print("全部已完成。。。。。。")