Scenario
- Data is processed with Spark; once processing finishes, the results must be delivered to the user as an Excel workbook that follows the Excel template the user provides
- Some tabs in the template hold data that is not computed by Spark, but those tabs must still appear in the final output with their tab name and column headers
- Some columns in the template do not exist in the computation; these columns must still be emitted, with their cells left blank
- Output requirements: tabs must appear strictly in template order, the columns of each tab strictly in template order, and the rows of each tab sorted by the specified fields
Implementation approach
- For each template tab, write the Spark output as one CSV file under a designated HDFS directory
- For each tab, write its ordered column names to a file under an HDFS directory, so later steps can fetch the columns in template order (a sketch of this helper appears after model_tab_map below)
- Provide a shared method that writes the final CSV with its fields formatted per the template (all columns of that tab, in the template's column order)
- Provide a shared method for tabs that have no data, where only the column headers need to be written
- Merge each tab's files from HDFS to the local filesystem
- Build a list of the template tabs in template order, read each CSV with pandas, sort the rows as required, and write the result to Excel
Code implementation
import os
import traceback
from collections import OrderedDict

import pandas as pd

# Maps each tab's CSV file name to its sheet title in the template, in template order
# (plain dict order is reliable on Python 3.7+; merge_csv_to_excel also wraps it in OrderedDict).
model_tab_map = {
    'tab1': 'Tab 1 title',
    'tab2': 'Tab 2 title'
}
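The column-file step from the approach list can be sketched as follows; the helper name save_column_file and the one-line, tab-separated file layout are illustrative assumptions, chosen to match the format that final_save_csv_file reads back.

def save_column_file(spark, tab_result_path, csv_file_name, ordered_columns):
    """
    Write one tab's ordered column names to column_files/<csv_file_name> on HDFS
    as a single tab-separated line (the format final_save_csv_file expects).
    :param spark: active SparkSession
    :param tab_result_path: HDFS base directory (expected to end with '/')
    :param csv_file_name: the tab's CSV file name, e.g. 'tab1'
    :param ordered_columns: list of column names in template order
    """
    line = '\t'.join(ordered_columns)
    # A one-row, one-column DataFrame written as text yields the header line on HDFS.
    spark.createDataFrame([(line,)], ['value'])\
        .repartition(1)\
        .write\
        .mode('overwrite')\
        .text('{}column_files/{}'.format(tab_result_path, csv_file_name))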
def final_save_csv_file(spark, tab_result_path, tab_dataframe, csv_file_name):
    """
    Save the final result of one tab to a CSV file on HDFS.
    :param spark: active SparkSession
    :param tab_result_path: HDFS directory holding the template CSVs (expected to end with '/')
    :param tab_dataframe: the DataFrame computed by Spark
    :param csv_file_name: the tab's CSV file name (a key of model_tab_map)
    :return:
    """
    # Read this tab's template column order (one tab-separated line).
    str_ordered_column_rows = spark.read.text('{}column_files/{}'.format(tab_result_path, csv_file_name)).collect()
    str_ordered_column = str_ordered_column_rows[0]
    columns = [column.strip() for column in str_ordered_column.value.split('\t')]
    exists_schema = [it.name for it in tab_dataframe.schema]
    # Build the select list: columns missing from the computation are emitted as empty strings.
    lst_select = []
    for i in columns:
        if i not in exists_schema:
            lst_select.append("'' as `{}`".format(i))
        else:
            lst_select.append('`{}` as `{}`'.format(i, i))
    tab_v_name = 'final_tab_res_{}_v'.format(csv_file_name)
    tab_dataframe.createOrReplaceTempView(tab_v_name)
    spark.sql(
        """
        select {}
        from {}
        """.format(','.join(lst_select), tab_v_name)
    )\
        .repartition(1)\
        .write\
        .csv(tab_result_path + 'final_tab/{}'.format(csv_file_name), header=True, sep='\t', mode='overwrite')
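A hypothetical call, with the SparkSession, HDFS base path, and source table as illustrative placeholders:

# Illustrative usage; 'spark', the path, and some_result_table are placeholders.
tab_result_path = 'hdfs:///user/report/tab_result/'
df_tab1 = spark.sql('select col1, col2, col3 from some_result_table')
final_save_csv_file(spark, tab_result_path, df_tab1, 'tab1')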
def save_empty_csv(tab_result_path, csv_file_name):
    """
    For a tab with no data, copy its column-name file to the output folder so the
    final Excel still shows the tab with its column headers.
    :param tab_result_path: HDFS directory holding the template CSVs
    :param csv_file_name: the tab's CSV file name
    :return:
    """
    column_file_path = os.path.join(tab_result_path, 'column_files', csv_file_name)
    csv_tab_path = os.path.join(tab_result_path, 'final_tab', csv_file_name)
    # The header line doubles as the empty tab's content, so an HDFS copy suffices.
    result = os.system('hdfs dfs -cp {} {}'.format(column_file_path, csv_tab_path))
    if result != 0:
        print('copy excel header failed: {}!'.format(csv_file_name))
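One possible driver loop over the template tabs, where computed_tabs is an assumed mapping from tab file names to the DataFrames Spark produced (illustrative, not part of the helpers above):

# Tabs with computed data go through final_save_csv_file; the rest get headers only.
computed_tabs = {'tab1': df_tab1}
for tab_name in model_tab_map:
    if tab_name in computed_tabs:
        final_save_csv_file(spark, tab_result_path, computed_tabs[tab_name], tab_name)
    else:
        save_empty_csv(tab_result_path, tab_name)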
def merge_csv_to_excel(tab_result_path, local_result_path):
    """
    Merge the per-tab CSV files into one Excel workbook.
    :param tab_result_path: HDFS directory holding the per-tab CSVs
    :param local_result_path: local directory for the merged output
    :return:
    """
    str_now = '2020-01-01'  # date partition for this run (hard-coded placeholder)
    local_result_path_time = os.path.join(local_result_path, str_now)
    tab_result_path_final = os.path.join(tab_result_path, 'final_tab')
    # Start from a clean local directory: create it, or empty it if it already exists.
    if not os.path.exists(local_result_path_time):
        os.makedirs(local_result_path_time)
    else:
        for root, dirs, files in os.walk(local_result_path_time, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))
    result = os.system('hdfs dfs -get {} {}'.format(tab_result_path_final, local_result_path_time))
    if result != 0:
        print('get files from HDFS failed!')
        return
    excel_tab_names = OrderedDict(model_tab_map)
    result_dir = os.path.join(local_result_path, 'final_result')
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    result_excel = os.path.join(result_dir, 'final_result_{}.xlsx'.format(str_now))
    writer = pd.ExcelWriter(result_excel)
    local_result_path_time = os.path.join(local_result_path_time, 'final_tab')
    try:
        # Iterate in template order; each CSV becomes one sheet.
        for tab_name in excel_tab_names:
            file_path = os.path.join(local_result_path_time, tab_name)
            if not os.path.exists(file_path):
                print('file {} does not exist!'.format(file_path))
                continue
            if os.path.isfile(file_path):
                csv = pd.read_csv(file_path, sep='\t')
                # 'col1' and 'col2' stand in for the sort fields the template requires.
                csv.sort_values(by=['col1', 'col2'], inplace=True)
                csv.to_excel(writer, sheet_name=model_tab_map[tab_name], index=False, header=True)
            elif os.path.isdir(file_path):
                # Spark writes a directory of part files; read the first real part file.
                for f in os.listdir(file_path):
                    if '_SUCCESS' in f:
                        continue
                    csv = pd.read_csv(os.path.join(file_path, f), sep='\t')
                    csv.sort_values(by=['col1', 'col2'], inplace=True)
                    csv.to_excel(writer, sheet_name=model_tab_map[tab_name], index=False, header=True)
                    break
            print('wrote {} to excel'.format(model_tab_map[tab_name]))
    except Exception:
        print('merge to excel, some error happened: {}'.format(traceback.format_exc()))
    finally:
        writer.close()
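A hypothetical end-to-end invocation once every tab has been written (both paths are placeholders):

# Illustrative run: pull the per-tab CSVs from HDFS and emit the final workbook.
merge_csv_to_excel('hdfs:///user/report/tab_result/', '/data/report/local_result')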