python自动生成hive sql脚本

本文介绍如何利用Python编程动态创建和执行Hive SQL脚本,从而提高数据处理的工作效率。通过构建函数和模板,我们可以自定义SQL查询,实现对大规模数据集的自动化分析。
摘要由CSDN通过智能技术生成
# -*- coding: utf-8 -*-
import os
import csv
import xlrd
from datetime import datetime, timedelta
import copy

def convert_xlsx_to_cvs(param_src_file_path, param_obj_file_path):
    """
    Convert every .xlsx workbook in a directory into a .cvs text file.

    Only the first sheet of each workbook is exported.  The target file
    name is derived from the source name: when it contains '-', only the
    part after the first '-' is kept, and all spaces are removed.  An
    existing target file is overwritten.

    :param param_src_file_path: source directory path (expected to end
                                with a path separator; paths are built
                                by plain string concatenation)
    :param param_obj_file_path: target directory path (same convention)
    :return: None
    """
    for file in os.listdir(param_src_file_path):
        # os.path.splitext is safe for names with no dot or several dots
        # (file.split('.')[1] raised IndexError on "noext" and
        # misclassified "a.b.xlsx" as extension "b").
        if os.path.splitext(file)[1] != ".xlsx":
            print("文件过滤")
            continue
        workbook = xlrd.open_workbook("{0}{1}".format(param_src_file_path, file))
        sheet_table = workbook.sheet_by_index(0)

        # Build the target file path ('.cvs' is the project's own
        # extension convention, kept as-is).
        file_name = os.path.splitext(file)[0]
        if '-' in file_name:
            file_name = file_name.split('-')[1]
        file_path = '{1}{0}.cvs'.format(file_name, param_obj_file_path)
        file_path = file_path.replace(' ', '')
        if os.path.exists(file_path):
            os.remove(file_path)

        # Convert: dump every row of the first sheet through csv.writer.
        # newline='' is required by the csv module; without it the
        # writer emits a blank line after every row on Windows.
        with open(file_path, 'w', encoding='utf-8', newline='') as inner_file:
            write = csv.writer(inner_file)
            for row_item in range(sheet_table.nrows):
                row_value = sheet_table.row_values(row_item)
                write.writerow(row_value)


def clean_cvs_file_content(param_file_path):
    """
    Clean every file in a directory in place, using three passes.

    Pass 1 drops blank and data-free rows and glues continuation rows
    (fewer than 7 commas) onto the record being built.  Pass 2 merges
    any remaining short rows back into the preceding record.  Pass 3
    re-joins the cells, tagging special merge cells (those containing
    \"decimal, \"number, date_format or concat) with '@' instead of ','
    so later parsing can tell a quoted comma from a field separator.

    :param param_file_path: directory path (expected to end with a
                            path separator)
    :return: None
    """
    data_free_rows = (",,,,,,,\n", ",,,,,,,,,\n", ",,,,,,,,\n", ",,,,,,\n")
    for file in os.listdir(param_file_path):
        full_path = "{0}{1}".format(param_file_path, file)

        # Pass 1: drop empties, glue continuation rows onto one line.
        pieces = []
        glue_next = False
        with open(full_path, 'r', encoding='utf-8') as src:
            for row in src:
                # Skip blank rows and rows that carry no data at all.
                if row == '\n' or row in data_free_rows:
                    continue
                if row.count(',') < 7:
                    # Short row: strip its newline so the next row
                    # continues on the same line.
                    row = row.replace('\n', ' ')
                    glue_next = True
                elif glue_next:
                    row = '\n' + row
                    glue_next = False
                pieces.append(row)
        once_cleaned = ''.join(pieces)

        # Pass 2: splice any remaining short rows onto the record
        # before them by removing the separating newline.
        twice_cleaned = ""
        for row in once_cleaned.split('\n'):
            row += '\n'
            if row.count(',') < 7:
                cut = twice_cleaned.rfind('\n')
                twice_cleaned = twice_cleaned[:cut]
            twice_cleaned += row

        # Pass 3: rebuild each row cell by cell, replacing the comma
        # after special merge cells with '@'.
        result = ""
        for row in twice_cleaned.split('\n'):
            row += '\n'
            rebuilt = ""
            for cell in row.split(','):
                if ("\"decimal" in cell) or ("\"number" in cell) \
                        or ('date_format' in cell) or ('concat' in cell):
                    rebuilt += cell + '@'
                elif '\n' in cell:
                    # Last cell of the row keeps its newline, no comma.
                    rebuilt += cell
                else:
                    rebuilt += cell + ','
            result += rebuilt
        with open(full_path, 'w', encoding='utf-8') as dst:
            dst.write(result)


def analyze_file_key_content(param_file_path):
    """
    Scan every .cvs file in a directory and extract its key metadata.

    For each file the following labelled rows are looked up (the value
    is the second comma-separated cell; for the primary-key and
    partition rows a double-quoted value takes precedence, so quoted
    keys containing commas survive intact):
    目标英文表名 / 目标中文表名 / 源表名 / 加载策略 / 目标表主键 / 分区字段

    When a label occurs more than once, the last occurrence wins.
    Values keep whatever trailing newline the raw split leaves on them.

    :param param_file_path: directory path (expected to end with a
                            path separator)
    :return: list of dicts with keys target_table, target_table_desc,
             src_table, load_strategy, file_path, target_table_key and
             party_key; a row that is absent leaves its value as None
    """
    list_ret = []
    for file in os.listdir(param_file_path):
        # splitext handles dot-less and multi-dot names safely
        # (file.split('.')[1] raised IndexError on names with no dot).
        if os.path.splitext(file)[1] != ".cvs":
            print("文件过滤")
            continue
        file_path = "{0}{1}".format(param_file_path, file)
        target_table = None        # target table English name
        target_table_desc = None   # target table Chinese name
        src_table = None           # source table name
        load_strategy = None       # load strategy (F1/F2/F3 ...)
        target_table_key = None    # target table primary key
        party_key = None           # partition column(s)
        with open(file_path, 'r', encoding='utf-8') as inner_file:
            for line in inner_file:
                line_split = line.split(',')
                line_split_quotation = line.split('\"')
                # Labels are mutually exclusive, so an elif chain is
                # equivalent to the original independent ifs.
                label = line_split[0]
                if label == "目标英文表名":
                    target_table = line_split[1]
                elif label == "目标中文表名":
                    target_table_desc = line_split[1]
                elif label == "源表名":
                    src_table = line_split[1]
                elif label == "加载策略":
                    load_strategy = line_split[1]
                elif label == "目标表主键":
                    if len(line_split_quotation) > 1:
                        target_table_key = line_split_quotation[1]
                    else:
                        target_table_key = line_split[1]
                elif label == "分区字段":
                    if len(line_split_quotation) > 1:
                        party_key = line_split_quotation[1]
                    else:
                        party_key = line_split[1]
        list_ret.append({"target_table": target_table,
                         "target_table_desc": target_table_desc,
                         "src_table": src_table,
                         "load_strategy": load_strategy,
                         "file_path": file_path,
                         "target_table_key": target_table_key,
                         "party_key": party_key})
    return list_ret


def analyze_file_content(param_key_content, param_target_file_path):
    """
    Dispatch each analysed file to the generator for its load strategy.

    :param param_key_content: list of per-file key-info dicts (as
                              produced by analyze_file_key_content)
    :param param_target_file_path: output directory for generated SQL
    :return: None
    """
    for file_key_info in param_key_content:
        strategy = file_key_info["load_strategy"]
        if strategy == "F1 - Full Overwrite":
            handlers = (analyze_file_f1_strategy,)
        elif strategy == "F2 - Update/Insert":
            handlers = (analyze_file_f2_strategy,)
        elif strategy == "F3 - Append":
            # F3 produces both an incremental and a full-load script.
            handlers = (analyze_file_f3_strategy, analyze_file_f3_strategy_full)
        else:
            # Unknown strategies are silently ignored, as before.
            handlers = ()
        for handler in handlers:
            handler(file_key_info, param_target_file_path)

def analyze_file_annotate(file_key_info):
    """
    Return the revision-history lines of an analysed file.

    Finds the row whose first cell is "修改记录" and returns every line
    starting two rows below it (skipping the marker row and its header
    row).  If the marker never appears, the whole file is returned.

    :param file_key_info: dict holding at least a "file_path" entry
    :return: list of raw lines (trailing newlines preserved)
    """
    source_path = file_key_info["file_path"]

    with open(source_path, 'r', encoding='utf-8') as src:
        all_lines = src.readlines()

    # The last occurrence of the marker wins, same as the original scan.
    begin = 0
    for position, raw_line in enumerate(all_lines):
        if raw_line.split(',')[0] == "修改记录":
            begin = position + 2
    return all_lines[begin:]


def analyze_file_f1_strategy(param_key_content, param_target_file_path):
    """
    策略F1的文件内容解析
    :param param_key_content: 文件路径
    :param param_target_file_path 目标文件
    :return:
    """
    load_strategy = param_key_content["load_strategy"]
    target_table = param_key_content["target_table"]
    target_table_desc = param_key_content["target_table_desc"]
    src_table = param_key_content["src_table"]
    param_file_path = param_key_content["file_path"]
    target_table_key = param_key_content["target_table_key"]
    party_key = param_key_content["party_key"]

    # 分区字段数据清洗
    src_db = None

    # debug info
    print("\n******************************** 文件关键信息(开始) *************************************")
    print("文件路径:", param_file_path)
    print("目标英文表名:", target_table)
    print("目标中文表名:", target_table_desc)
    print("目标表主键:", target_table_key)
    print("源表名:", src_table)
    print("加载策略:", load_strategy)
    print("********************************* 文件关键信息(结束) ************************************\n")

    # 表的对应关系(不分 group 的情况下)
    with open(param_file_path, 'r', encoding='utf-8') as inner_file:
        # 原始内容
        line_content = inner_file.readlines()

        # group 分组计数(最多检测10组)
        group_count = 0
        for index in range(10):
            if "Group {0}".format(index) in ''.join(line_content):
                group_count += 1
        print("分组数量:{0}".format(group_count))

        # 注释信息
        sql_annotate = "-- ************************************** Base Info ************************************** \n"
        sql_annotate += "-- Target Table English Name:{0} \n".format(target_table)
        sql_annotate += "-- Target Table Chinese Name:{0} \n".format(target_table_desc)
        sql_annotate += "-- Create Date:{0} \n".format(datetime.now())

        # 待写入的语句信息
        sql_all_write = ""
        
        # sql 头部内容
        sql_content_pre = """
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode = 1000;
"""

        # 分组数大于等于1时
        if group_count > 0:
            for index in range(group_count):
                # sql 头部内容
                '''sql_content_pre = """
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode = 1000;
insert overwrite table data_lake.{0}
select
""".format(target_table)'''
                sql_content_join = """\nfrom """

                # 组内容拆分
                index += 1
                group_name = "Group {0}".format(index)
                end_name = "修改记录"
                if index < group_count:
                    end_name = "Group {0}".format(index + 1)
                print("开始处理: {0}".format(group_name))
                start_index = 0
                end_index = 0
                current_index = 0
                for line in line_content:
                    line_split = line.split(',')
                    if line_split[0] == group_name:
                        start_index = current_index
                    if line_split[0] == end_name:
                        end_index = current_index
                    current_index += 1
                effect_lines = line_content[start_index:end_index]

                # 源表名
                src_table = effect_lines[1].split(',')[1]
                print("源表名:{0}".format(src_table))
                
                # 组内 sql info
                '''if index <= 1:
                    if party_key.strip() == '':
                        sql_content_pre = "insert overwrite table data_lake.{0} \n" \
                                          "select \n".format(target_table)
                    else:
                        sql_content_pre = "insert overwrite table data_lake.{0} \n" \
                                           "partition({1}) \n"\
                                           "select \n".format(target_table, party_key)               
                else:
                    sql_content_pre = "union all\n" \
                                      "select \n".format(target_table)'''      

                if party_key.strip() == '':
                    sql_content_pre = "set hive.exec.dynamic.partition=true;\n"\
                    "set hive.exec.dynamic.partition.mode=nonstrict;\n"\
                    "set hive.exec.max.dynamic.partitions.pernode = 1000;\n"\
                    "insert overwrite table data_lake.{0} \n" \
                                      "select \n".format(target_table)
                else:
                    sql_content_pre = "set hive.exec.dynamic.partition=true;\n"\
                    "set hive.exec.dynamic.partition.mode=nonstrict;\n"\
                    "set hive.exec.max.dynamic.partitions.pernode = 1000;\n"\
                    "insert overwrite table data_lake.{0} \n" \
                    "partition({1}) \n"\
                    "select \n".format(target_table, party_key)
                
                # 有效的数据内容
                group_lines = effect_lines[4:]
                join_index = 0
                for line in group_lines:
                    if line.split(',')[0] == "关联条件":
                        break
                    join_index += 1
                group_effect_lines = group_lines[:join_index]
                end_index = 0
                for line in group_lines:
                    if line.split(',')[0] == "条件语句(Where / Group By / Having)":
                        break
                    end_index += 1
                connect_lines = group_lines[join_index + 2:end_index]

                # 组内容(调试)
                print(group_effect_lines)
                print(connect_lines)

                # 注释拼接
                sql_content_pre = "-- ************************************************************************** \n" \
                                  + sql_content_pre
                sql_content_pre = "--   Group {0}: {1} \n".format(index, target_table.replace('\n', '')) \
                                  + sql_content_pre
                sql_content_pre = "\n-- ************************************************************************** \n" \
                                  + sql_content_pre

                # 有效语句拼接
                for line in group_effect_lines:
                    line_split = line.split(',')

                    # 源库
                    if not src_db:
                        src_db = 'sdata_full'

                    # 内容拼接
                    if line_split[0] == "" or line_split[1] == "":
                        reflect_data = line_split[7].replace('\n', '')
                        if reflect_data == "":
                            reflect_data = "\'\'"
                        if ('-' not in reflect_data) and (' ' not in reflect_data) and reflect_data[-1] == '\'' and \
                                reflect_data[0] != '\'':
                            reflect_data = '\'' + reflect_data
                        
                        if ("decimal" in line_split[6]) and ('@' in line_split[6]):
                            line_split[6] = "decimal(38,10)"
                        
                        if 'etl_dt' in line_split[4].lower():
                            reflect_data = 'current_timestamp()'
                        
                        if 'date_format' in reflect_data.lower():
                            reflect_data = reflect_data.replace('"','')
                            reflect_data = reflect_data.replace('@',',')
                            reflect_data = "cast({0} as {1})".format(reflect_data, line_split[6])
                        
                        '''if reflect_data == "${TX_DATE}":
                            line_split[6] = line_split[6].replace('@',',')
                            line_split[6] = line_split[6].replace('"','')
                            reflect_data = "cast('{0}' as {1})".format(reflect_data, line_split[6])'''
                        sql_content_pre += "    {0}    as {1},\n".format(reflect_data, line_split[4])
                    else:
                        if 'case' in line_split[7]:
                            if line_split[7][0] == "\"":
                                tmp_key_list = list(line_split[7])
                                tmp_key_list[line_split[7].rfind('"')] = ''
                                tmp_key_list[line_split[7].find('"')] = ''
                                tmp_key = ''.join(tmp_key_list)
                                tmp_key = tmp_key.replace('\n', ' ')
                            else:
                                tmp_key = line_split[7].replace('\n', ' ')
                            tmp_key = tmp_key.replace(u"’", "\'")
                            if ' end' in tmp_key:
                                tmp_key = "({0})".format(tmp_key)
                            else:
                                tmp_key = "({0} end )".format(tmp_key)
                            tmp_key = tmp_key.replace('when ', ' when ')
                            tmp_key = tmp_key.replace('then ', ' then ')
                            tmp_key = tmp_key.replace('else ', ' else ')
                            tmp_key = tmp_key.replace('  ', ' ')
                        elif 'date_format' in line_split[7].lower():
                            tmp_key = line_split[7].replace('"',''
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值