场 景
技术背景:使用hadoop集群搭建数仓,前端业务DB,mysql,mssql居多,构建数据仓库ods层需要将数据先集成到hdfs内,又被parquet文件的有序格式深深吸引,说白了就是想要实现将前端业务DB的数据以parquet文件的形式unload到hdfs内,穷的又买不起强大的ETL工具,kettle屡屡碰壁后,就只能回到最原始的shell脚本来最简单的实现该功能,毕竟相对于Java、Python少了导包操作还很开心的;
shell脚本实现逻辑模板unload_to_hdfs.sh
#!/usr/bin/env bash
# unload_to_hdfs.sh
# Unload one source-DB table to HDFS as parquet via spark-shell (JDBC read),
# then load the parquet files into the dw ods Hive table partition.
#
# Args:
#   $1  date YYYYMMDD (script shifts it back one day -> data day)
#   $2  jdbc url                     $3  sql file (dbtable expression)
#   $4  db user                      $5  db password
#   $6  hdfs staging dir (a /<date> subdir is appended)
#   $7  ods hdfs base path           $8  ods table name (in the dw database)
# Exits 7 when the unload-to-hdfs or the load-to-ods step fails
# (verified via the _SUCCESS marker files).

mydate=${1}
mydate=$(date -d "1 day ago ${mydate}" +%Y%m%d)
eventweek=$(date -d "${mydate}" +%V)
eventweek=$((10#$eventweek))   # force base-10: strip %V's leading zero, convert eventweek to int

host=$2
db_file=$3
user=$4
passwd=$5
target_hdfs=$6/$mydate
target_ods=$7
target_ods_table=$8

# Flatten the sql file onto one line (CR and LF -> spaces): the generated
# spark-shell script must be a single line.
db=$(tr '\n\r' '  ' < "${db_file}")

echo "mydate: $mydate"
echo "eventweek: $eventweek"
echo "host: $host"
echo "db: $db"
echo "target_hdfs: $target_hdfs"
echo "target_ods: $target_ods"
echo "target_ods_table: $target_ods_table"

# Build the one-line scala job and run it on YARN.  Factored out because both
# branches below run the exact same command.
# NOTE(review): the password ends up in the scala text and on the driver's
# command line; prefer a credentials file if the cluster supports it.
run_spark_unload() {
  echo "sc.getConf.set(\"spark.debug.maxToStringFields\", \"500\");spark.read.format(\"jdbc\").option(\"url\",\"${host}\" ).option(\"user\", \"${user}\").option(\"password\",\"${passwd}\").option(\"dbtable\", \"${db}\").load().write.parquet(\"${target_hdfs}\")" \
    | spark-shell --master yarn --deploy-mode client \
        --executor-memory 2G --executor-cores 1 --num-executors 4 --queue etl
}

# Rerun-safe: drop any previous output for this date before writing.
if hadoop fs -test -e "${target_hdfs}"; then
  echo "hdfs file_data is exist!"
  hadoop fs -rm -r "${target_hdfs}" && run_spark_unload
else
  echo "hdfs file_data is not exist"
  run_spark_unload
fi

# Spark writes a _SUCCESS marker only on a fully successful job.
hdfs_path=${target_hdfs}/_SUCCESS
echo "hdfs_path: $hdfs_path"
if hadoop fs -test -e "${hdfs_path}"; then
  echo "spark job unload to hdfs run successfully"
  echo "spark job load to ods start"
  echo "load data inpath '${target_hdfs}/*' overwrite into table dw.${target_ods_table} partition(event_week=${eventweek},event_day='${mydate}',event_hour='00');" | hive
  # `load data inpath` moves the files, so the marker now lives under the
  # ods partition directory.
  ods_path=${target_ods}/event_week=${eventweek}/event_day=${mydate}/event_hour=00/_SUCCESS
  echo "ods_path: $ods_path"
  if hadoop fs -test -e "${ods_path}"; then
    echo "spark job load to ods run successfully"
  else
    echo "spark job load to ods run failed"
    exit 7
  fi
else
  echo "spark job unload to hdfs run failed"
  exit 7
fi
调用Shell
bash unload_to_hdfs.sh 20191013 "jdbc:sqlserver://10.218.7.10:1433;DatabaseName=DB_Hello" sqlserver.sql mylogin 7758521 /source/sqlserver/ /hive/warehouse/ods/sqlserver ods_mytable
参数说明
bash unload_to_hdfs.sh 参数1 参数2 参数3 参数4 参数5 参数6 参数7 参数8
1. 参数1:日期"YYYYMMDD"
2. 参数2:url sqlserver:"jdbc:sqlserver://10.218.7.10:1433;DatabaseName=DB_Hello"
mysql "jdbc:mysql://10.217.6.21:3306/mydb"
最好带上"",不然sqlserver的分号会转义
3. 参数3:sql文件路径 "/home/ods_mytable.sql"
sql文件格式1(带select的):(select col1 as mycol1,col2 as mycol2 from table1) as tmp
sql文件格式2(直接抽表):dbo.mytable
统一使用格式1,sql语句最好不要换行,as必须要加;
参数4:数据库登录账号
参数5:数据库登录密码
参数6:hdfs存放目录/source/mytable
配置只需要写到表级别folder,shell脚本后续会自动加上日期和parquet文件
参数7:ods层hdfs路径,如/hive/warehouse/ods/mytable,脚本会拼接分区目录校验_SUCCESS
参数8:ods目标表名(hive的dw库下的表名)
参数进阶——配置表
规范一点的话,就把“调用Shell”的这些参数写到一张mysql表内,先把配置表建起来,sql语句如下:
-- Config table: one row per source table to unload.  Describes where to pull
-- the data from (jdbc url + credentials + sql file) and where the parquet
-- staging copy and the ods Hive copy land.  Read by main_ods_hdfs.sh, keyed
-- by (dataset_id, group_id, target_file_name).
CREATE TABLE `cfg_ods_dataset` (
`id` bigint(20) NOT NULL AUTO_INCREMENT comment'自增id',
`dataset_id` bigint(20) DEFAULT NULL comment'数据集id',
`dataset_name` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'数据集名称',
`source_table` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'源数据表',
`sql_name` varchar(250) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'源数据取数逻辑',
`source_ulr` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'源数据jdbc连接',
`target_file_name` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'hdfs目标文件名称',
`target_hdfs_path` varchar(200) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'hdfs目标文件中转路径',
`group_id` bigint(20) DEFAULT NULL comment'数据集分组id',
`sequenceid` int(11) DEFAULT NULL comment'数据集分组内表跑的顺序id',
`is_enable` tinyint(4) DEFAULT NULL comment'是否有效',
`create_time` datetime DEFAULT CURRENT_TIMESTAMP comment'创建时间',
`update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP comment'更新时间',
`db_user` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'源数据库用户名',
`db_user_pw` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'源数据库密码',
`target_ods_path` varchar(250) COLLATE utf8mb4_unicode_ci DEFAULT NULL comment'hdfs的ods层文件路径',
PRIMARY KEY (`id`),
-- One config row per (dataset, group, ods file name) combination.
UNIQUE KEY `uidx_cfg_ods_dataset` (`dataset_id`,`group_id`,`target_file_name`)
) ENGINE=InnoDB AUTO_INCREMENT=186 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
;
然后写入配置表
-- Sample config row: unload SQL Server table basic.tbd_Area into the
-- ods_rs_basic_tbd_area dataset.  source_ulr shows the jdbc format; see the
-- inline comment for the MySQL equivalent.
INSERT INTO dw_config.cfg_ods_dataset
(
dataset_id,
dataset_name,
source_table,
sql_name,
source_ulr,
target_file_name,
target_hdfs_path,
group_id,
sequenceid,
is_enable,
create_time,
update_time,
db_user,
db_user_pw,
target_ods_path
) VALUES
( 1,
'rs_basic',
'basic.tbd_Area',
'ods_rs_basic_tbd_area.sql',
-- MySQL requires whitespace after "--" for a comment, hence "-- mysql" below.
'jdbc:sqlserver://10.256.232.101:1433;DatabaseName=DB_Basic', -- mysql: jdbc:mysql://10.256.39.12:13300/dw_config?serverTimezone=GMT%2B8&characterEncoding=UTF-8
'ods_rs_basic_tbd_area', '/source/rs/basic/ods_rs_basic_tbd_area',
3,
1000,
1,
now(),
now(),
'dw_user_reader',
'123456',
'/hive/warehouse/ods/rs/basic/ods_rs_basic_tbd_area'
);
再写个shell调用配置表的参数main_ods_hdfs.sh,去调用unload_to_hdfs.sh
#!/bin/bash
# main_ods_hdfs.sh
# Look up one dataset's unload parameters in the cfg_ods_dataset config table
# and pass them to the per-table unload script.
#
# Usage: main_ods_hdfs.sh <event_day YYYYMMDD> <dataset_id> <group_id> <target_file_name>

# Connection info for the MySQL server holding the config table.
# NOTE(review): credentials are hard-coded and passed on the mysql command
# line (visible in `ps`); prefer a --defaults-extra-file / ~/.my.cnf.
hostname="10.252.103.39"
port="3306"
username="dw_user"
password="1234asdf"
database="dw_config"

curr_dir=$(pwd)

# Validate arguments BEFORE doing any date arithmetic on $1.
if [[ -z "$1" || -z "$2" || -z "$3" || -z "$4" ]]; then
  echo "There are no 4 variables inputed, please input at least 4 variables!"
  exit 1
fi
echo "The parameters you input are: $@"

event_day=$1
# Must be exactly 8 digits (anchored) AND an actual calendar date.
if ! { echo "$event_day" | grep -Eq "^[0-9]{8}$" && date -d "$event_day" +%Y%m%d > /dev/null 2>&1; }; then
  echo "输入的是非日期数据值或者日期格式不正确,应为yyyymmdd,比如20111001"
  exit 1
fi

sec=$(date -d "$event_day" +%s)
event_yesterday=$(date -d @"$((sec - 86400))" +%Y%m%d)

dataset_id=$2
group_id=$3
target_file_name=$4
select_sql="SELECT source_ulr,sql_name,db_user,db_user_pw,target_hdfs_path,target_ods_path FROM cfg_ods_dataset WHERE dataset_id=$dataset_id AND group_id=$group_id AND target_file_name='$target_file_name';"

# mysql -s prints one whitespace-separated row per config entry; read the six
# columns straight into named variables (no per-field awk).  Process
# substitution keeps the loop in the current shell so `exit 1` works.
while read -r source_ulr sql_name db_user db_user_pw target_hdfs_path target_ods_path; do
  source_sql_path="${curr_dir}/../src/${sql_name}"
  echo "dateset_id:${dataset_id},group_id:${group_id},target_file_name:${target_file_name},source_ulr:${source_ulr},source_sql_path:${source_sql_path},target_hdfs_path:${target_hdfs_path},db_user:${db_user}"
  # Run the per-table unload (the unload_to_hdfs.sh template, deployed here
  # under the name load_data_to_ods.sh).
  if bash "${curr_dir}/../src/load_data_to_ods.sh" "${event_day}" "${source_ulr}" "${source_sql_path}" "${db_user}" "${db_user_pw}" "${target_hdfs_path}" "${target_ods_path}" "${target_file_name}"; then
    echo "Successful: load data of ${event_yesterday} to ods hive table ${target_file_name} successful!!!!"
  else
    echo "Error: load data of ${event_yesterday} to ods hive table ${target_file_name} failed!!!!"
    exit 1
  fi
done < <(mysql -h"${hostname}" -P"${port}" -u"${username}" -p"${password}" "${database}" -e "${select_sql}" -s)
这样就能很简单的调用当前的这张表去上集群了
#main_ods_hdfs.sh接参数 日期,dataset_id group_id ods文件名
./main_ods_hdfs.sh 20200319 1 3 ods_rs_assemble_tbb_burst_point
调度
剩下的就是用Azkaban传参数调度即可;