# -*- coding: utf-8 -*-
import os
import csv
import xlrd
from datetime import datetime, timedelta
import copy
def convert_xlsx_to_cvs(param_src_file_path, param_obj_file_path):
    """
    Convert every .xlsx workbook in a source directory into a .cvs text file.

    Only the first worksheet of each workbook is exported. The output file
    name is the workbook name with the part before the first '-' dropped and
    all spaces removed.

    :param param_src_file_path: source directory (must end with a path separator,
        since it is concatenated with the file name by string formatting)
    :param param_obj_file_path: target directory (same separator requirement)
    :return: nothing
    """
    for file in os.listdir(param_src_file_path):
        # os.path.splitext is robust where ``file.split('.')[1]`` was not:
        # it neither raises on extensionless names nor misreads "a.b.xlsx".
        if os.path.splitext(file)[1] != ".xlsx":
            print("文件过滤")
            continue
        workbook = xlrd.open_workbook("{0}{1}".format(param_src_file_path, file))
        sheet_table = workbook.sheet_by_index(0)
        # Build the target file name: keep only the part after the first '-'.
        file_name = os.path.splitext(file)[0]
        if '-' in file_name:
            file_name = file_name.split('-')[1]
        file_path = '{1}{0}.cvs'.format(file_name, param_obj_file_path)
        file_path = file_path.replace(' ', '')
        if os.path.exists(file_path):
            os.remove(file_path)
        # newline='' is required by the csv module; without it the writer's
        # '\r\n' terminators get translated again on Windows, corrupting rows.
        with open(file_path, 'w', encoding='utf-8', newline='') as inner_file:
            writer = csv.writer(inner_file)
            for row_item in range(sheet_table.nrows):
                writer.writerow(sheet_table.row_values(row_item))
def clean_cvs_file_content(param_file_path):
    """
    Clean the content of every file in the given directory, in three passes.

    Pass 1 drops blank/separator-only rows and glues wrapped row fragments
    onto one physical line; pass 2 merges any row that still has fewer than
    7 commas into the previous row; pass 3 replaces the comma that follows
    special spreadsheet tokens (decimal/number/date_format/concat) with '@'
    so that later splits on ',' do not break those expressions apart.
    Files are rewritten in place.

    :param param_file_path: directory to process (must end with a path
        separator, since it is concatenated with each file name)
    :return: nothing
    """
    for file in os.listdir(param_file_path):
        file_content = ""
        # Pass 1: drop noise rows and join continuation fragments
        with open("{0}{1}".format(param_file_path, file), 'r', encoding='utf-8') as inner_file:
            skip_flag = False
            for line in inner_file.readlines():
                # skip blank lines
                if line == '\n':
                    continue
                # skip rows that contain only separators (no data)
                if line == ",,,,,,,\n":
                    continue
                if line == ",,,,,,,,,\n":
                    continue
                if line == ",,,,,,,,\n":
                    continue
                if line == ",,,,,,\n":
                    continue
                # a row with < 7 commas is a wrapped fragment: keep it on the
                # current output line and restart on the next complete row
                if line.count(',') < 7:
                    line = line.replace('\n', ' ')
                    skip_flag = True
                else:
                    if skip_flag:
                        line = '\n' + line
                        skip_flag = False
                file_content += line
        # Pass 2: merge remaining short rows into the previous row
        file_content_copy = ""
        for line in file_content.split('\n'):
            line += '\n'
            # short row: chop the previous newline so this row joins it
            if line.count(',') < 7:
                ctrl_index = file_content_copy.rfind('\n')
                file_content_copy = file_content_copy[0:ctrl_index]
            file_content_copy += line
        # Pass 3: protect special merge tokens
        file_content_copy_third = ""
        for line in file_content_copy.split('\n'):
            line += '\n'
            # replace the separating comma after special tokens with '@'
            line_split = line.split(',')
            join_line = ""
            for inner_line in line_split:
                if ("\"decimal" in inner_line) or ("\"number" in inner_line) or ('date_format' in inner_line) or ('concat' in inner_line):
                    join_line += inner_line
                    join_line += '@'
                elif '\n' in inner_line:
                    # last cell already carries the newline: no separator
                    join_line += inner_line
                else:
                    join_line += inner_line
                    join_line += ','
            line = join_line
            file_content_copy_third += line
        with open("{0}{1}".format(param_file_path, file), 'w', encoding='utf-8') as inner_file:
            inner_file.write(file_content_copy_third)
def analyze_file_key_content(param_file_path):
    """
    Scan every .cvs file in a directory and extract its key header fields.

    Each row's first cell is matched against a fixed set of Chinese field
    labels; the matching value is the second comma-separated cell, except for
    the primary-key and partition-key rows, where a double-quoted value (the
    first quoted segment) takes precedence when present.

    :param param_file_path: directory to scan (must end with a path separator)
    :return: list of dicts, one per file, with keys target_table,
        target_table_desc, src_table, load_strategy, file_path,
        target_table_key and party_key (missing fields stay None)
    """
    # label -> result key, for fields taken straight from the second cell
    simple_fields = {
        "目标英文表名": "target_table",
        "目标中文表名": "target_table_desc",
        "源表名": "src_table",
        "加载策略": "load_strategy",
    }
    # label -> result key, for fields that may be wrapped in double quotes
    quoted_fields = {
        "目标表主键": "target_table_key",
        "分区字段": "party_key",
    }
    results = []
    for entry in os.listdir(param_file_path):
        if entry.split('.')[1] != "cvs":
            print("文件过滤")
            continue
        full_path = "{0}{1}".format(param_file_path, entry)
        record = {"target_table": None,
                  "target_table_desc": None,
                  "src_table": None,
                  "load_strategy": None,
                  "file_path": full_path,
                  "target_table_key": None,
                  "party_key": None}
        with open(full_path, 'r', encoding='utf-8') as handle:
            for row in handle.readlines():
                cells = row.split(',')
                quoted = row.split('\"')
                tag = cells[0]
                if tag in simple_fields:
                    record[simple_fields[tag]] = cells[1]
                elif tag in quoted_fields:
                    # prefer the quoted segment when the row contains quotes
                    record[quoted_fields[tag]] = quoted[1] if len(quoted) > 1 else cells[1]
        results.append(record)
    return results
def analyze_file_content(param_key_content, param_target_file_path):
    """
    Route each file's key info to the generator(s) for its load strategy.

    :param param_key_content: list of key-info dicts produced by
        analyze_file_key_content
    :param param_target_file_path: directory that receives the generated output
    :return: nothing
    """
    # strategy label -> handlers to invoke (the F3 strategy triggers two)
    dispatch = {
        "F1 - Full Overwrite": (analyze_file_f1_strategy,),
        "F2 - Update/Insert": (analyze_file_f2_strategy,),
        "F3 - Append": (analyze_file_f3_strategy, analyze_file_f3_strategy_full),
    }
    for file_key_info in param_key_content:
        for handler in dispatch.get(file_key_info["load_strategy"], ()):
            handler(file_key_info, param_target_file_path)
def analyze_file_annotate(file_key_info):
    """
    Return the change-log lines that follow the last "修改记录" marker row.

    The marker row and the header row right after it are excluded (the slice
    starts two lines past the marker). If no marker row exists, all lines of
    the file are returned.

    :param file_key_info: key-info dict; only its "file_path" entry is used
    :return: list of raw lines (newlines preserved)
    """
    with open(file_key_info["file_path"], 'r', encoding='utf-8') as handle:
        rows = handle.readlines()
    begin = 0
    for position, row in enumerate(rows):
        # the last marker wins, matching a row whose first cell is the label
        if row.split(',')[0] == "修改记录":
            begin = position + 2
    return rows[begin:]
def analyze_file_f1_strategy(param_key_content, param_target_file_path):
"""
策略F1的文件内容解析
:param param_key_content: 文件路径
:param param_target_file_path 目标文件
:return:
"""
load_strategy = param_key_content["load_strategy"]
target_table = param_key_content["target_table"]
target_table_desc = param_key_content["target_table_desc"]
src_table = param_key_content["src_table"]
param_file_path = param_key_content["file_path"]
target_table_key = param_key_content["target_table_key"]
party_key = param_key_content["party_key"]
# 分区字段数据清洗
src_db = None
# debug info
print("\n******************************** 文件关键信息(开始) *************************************")
print("文件路径:", param_file_path)
print("目标英文表名:", target_table)
print("目标中文表名:", target_table_desc)
print("目标表主键:", target_table_key)
print("源表名:", src_table)
print("加载策略:", load_strategy)
print("********************************* 文件关键信息(结束) ************************************\n")
# 表的对应关系(不分 group 的情况下)
with open(param_file_path, 'r', encoding='utf-8') as inner_file:
# 原始内容
line_content = inner_file.readlines()
# group 分组计数(最多检测10组)
group_count = 0
for index in range(10):
if "Group {0}".format(index) in ''.join(line_content):
group_count += 1
print("分组数量:{0}".format(group_count))
# 注释信息
sql_annotate = "-- ************************************** Base Info ************************************** \n"
sql_annotate += "-- Target Table English Name:{0} \n".format(target_table)
sql_annotate += "-- Target Table Chinese Name:{0} \n".format(target_table_desc)
sql_annotate += "-- Create Date:{0} \n".format(datetime.now())
# 待写入的语句信息
sql_all_write = ""
# sql 头部内容
sql_content_pre = """
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode = 1000;
"""
# 分组数大于等于1时
if group_count > 0:
for index in range(group_count):
# sql 头部内容
'''sql_content_pre = """
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode = 1000;
insert overwrite table data_lake.{0}
select
""".format(target_table)'''
sql_content_join = """\nfrom """
# 组内容拆分
index += 1
group_name = "Group {0}".format(index)
end_name = "修改记录"
if index < group_count:
end_name = "Group {0}".format(index + 1)
print("开始处理: {0}".format(group_name))
start_index = 0
end_index = 0
current_index = 0
for line in line_content:
line_split = line.split(',')
if line_split[0] == group_name:
start_index = current_index
if line_split[0] == end_name:
end_index = current_index
current_index += 1
effect_lines = line_content[start_index:end_index]
# 源表名
src_table = effect_lines[1].split(',')[1]
print("源表名:{0}".format(src_table))
# 组内 sql info
'''if index <= 1:
if party_key.strip() == '':
sql_content_pre = "insert overwrite table data_lake.{0} \n" \
"select \n".format(target_table)
else:
sql_content_pre = "insert overwrite table data_lake.{0} \n" \
"partition({1}) \n"\
"select \n".format(target_table, party_key)
else:
sql_content_pre = "union all\n" \
"select \n".format(target_table)'''
if party_key.strip() == '':
sql_content_pre = "set hive.exec.dynamic.partition=true;\n"\
"set hive.exec.dynamic.partition.mode=nonstrict;\n"\
"set hive.exec.max.dynamic.partitions.pernode = 1000;\n"\
"insert overwrite table data_lake.{0} \n" \
"select \n".format(target_table)
else:
sql_content_pre = "set hive.exec.dynamic.partition=true;\n"\
"set hive.exec.dynamic.partition.mode=nonstrict;\n"\
"set hive.exec.max.dynamic.partitions.pernode = 1000;\n"\
"insert overwrite table data_lake.{0} \n" \
"partition({1}) \n"\
"select \n".format(target_table, party_key)
# 有效的数据内容
group_lines = effect_lines[4:]
join_index = 0
for line in group_lines:
if line.split(',')[0] == "关联条件":
break
join_index += 1
group_effect_lines = group_lines[:join_index]
end_index = 0
for line in group_lines:
if line.split(',')[0] == "条件语句(Where / Group By / Having)":
break
end_index += 1
connect_lines = group_lines[join_index + 2:end_index]
# 组内容(调试)
print(group_effect_lines)
print(connect_lines)
# 注释拼接
sql_content_pre = "-- ************************************************************************** \n" \
+ sql_content_pre
sql_content_pre = "-- Group {0}: {1} \n".format(index, target_table.replace('\n', '')) \
+ sql_content_pre
sql_content_pre = "\n-- ************************************************************************** \n" \
+ sql_content_pre
# 有效语句拼接
for line in group_effect_lines:
line_split = line.split(',')
# 源库
if not src_db:
src_db = 'sdata_full'
# 内容拼接
if line_split[0] == "" or line_split[1] == "":
reflect_data = line_split[7].replace('\n', '')
if reflect_data == "":
reflect_data = "\'\'"
if ('-' not in reflect_data) and (' ' not in reflect_data) and reflect_data[-1] == '\'' and \
reflect_data[0] != '\'':
reflect_data = '\'' + reflect_data
if ("decimal" in line_split[6]) and ('@' in line_split[6]):
line_split[6] = "decimal(38,10)"
if 'etl_dt' in line_split[4].lower():
reflect_data = 'current_timestamp()'
if 'date_format' in reflect_data.lower():
reflect_data = reflect_data.replace('"','')
reflect_data = reflect_data.replace('@',',')
reflect_data = "cast({0} as {1})".format(reflect_data, line_split[6])
'''if reflect_data == "${TX_DATE}":
line_split[6] = line_split[6].replace('@',',')
line_split[6] = line_split[6].replace('"','')
reflect_data = "cast('{0}' as {1})".format(reflect_data, line_split[6])'''
sql_content_pre += " {0} as {1},\n".format(reflect_data, line_split[4])
else:
if 'case' in line_split[7]:
if line_split[7][0] == "\"":
tmp_key_list = list(line_split[7])
tmp_key_list[line_split[7].rfind('"')] = ''
tmp_key_list[line_split[7].find('"')] = ''
tmp_key = ''.join(tmp_key_list)
tmp_key = tmp_key.replace('\n', ' ')
else:
tmp_key = line_split[7].replace('\n', ' ')
tmp_key = tmp_key.replace(u"’", "\'")
if ' end' in tmp_key:
tmp_key = "({0})".format(tmp_key)
else:
tmp_key = "({0} end )".format(tmp_key)
tmp_key = tmp_key.replace('when ', ' when ')
tmp_key = tmp_key.replace('then ', ' then ')
tmp_key = tmp_key.replace('else ', ' else ')
tmp_key = tmp_key.replace(' ', ' ')
elif 'date_format' in line_split[7].lower():
tmp_key = line_split[7].replace('"',''
# --- Scraped CSDN article metadata (not part of the script) ---
# Title: python自动生成hive sql脚本
# Latest recommended article published 2021-09-27 21:06:07
# The article describes how to dynamically build and execute Hive SQL scripts
# with Python, using functions and templates to customize SQL queries and
# automate analysis of large data sets.
# Abstract generated by CSDN.