本文介绍自动化按照指定时间段增量抽取关系型数据库(oracle,mysql,postgresql)到hive/impala。
##增量抽数: 需要五个参数 参数1:增量标示 参数2: 源数据库用户名 参数3:hive表名(dwr_源表名) 参数4:where条件 参数5:开始时间 参数6:结束时间
## sh /RPT/etlscript/sqoop/shell/sqoop_import_source_data_by_tables_add.sh add p3fdcadm dwr_EQP_CONTEXT_PARAM_MST_PP "where start_dtts>=to_date('LVS_START_TIMEKEY','yyyy/mm/dd hh24:mi:ss') and start_dtts<to_date('LVS_END_TIMEKEY','yyyy/mm/dd hh24:mi:ss')" 20210721073000 20210721083000
#获取传参
add_or_cover=`echo $1 | tr 'A-Z' 'a-z'` ##第一个参数字母大写转成小写
#业务系统简称(与source_db_tns.conf里的一致)
source_sys=`echo $2 | tr '[A-Z]' '[a-z]'`
#hive表名
hive_table_name="$3"
let starttimekeyNo=$#-1
let endtimekeyNo=$#
eval start_timekey=\$${starttimekeyNo}
eval end_timekey=\$${endtimekeyNo}
#The first args must be 'add'
if [ "${add_or_cover}" = "add" ] ; then
where_condition=`echo $4|sed -e "s/LVS_START_TIMEKEY/${start_timekey}/g" -e "s/LVS_END_TIMEKEY/${end_timekey}/g"`
if [ ! $# -eq 6 ] ; then
echo -e "\n[ERROR] You must input 6 args for increment ETL! [add]"
echo " Format:sh $0 [add] [database] [target_table_name] [\"where_condition\"] [start_timekey] [end_timekey]"
echo " Example:sh $0 add mest3 dwr_producthistory \"where timekey >= 'LVS_START_TIMEKEY' and timekey < 'LVS_END_TIMEKEY' \" 20200101073000 20200101083000"
exit 1
fi
#分区值
partition_value_date=${start_timekey:0:8}
partition_value_hour=${start_timekey:0:12}
else
echo -e "\n[ERROR] The first args must be 'add' !"
exit 1
fi
# 判断hive table_name 是否合法 1:以dwr_开头;2:必须含有两个下划线
where_tns_conf='/RPT/etlscript/sqoop/conf/source_db_tns.conf'
if [ ! ${hive_table_name:0:4} = "dwr_" ] ; then
echo -e "\n[ERROR] The target table name(the 2nd args) must start with \"dwr_\""
exit 1
else
count_=`echo ${hive_table_name} | awk -F"_" '{print NF}'`
if [ "${count_}" -lt "2" ] ; then
echo -e "\n[ERROR] The target table name must like \"dwr_xxx\"."
exit 1
fi
fi
#hive表去掉dwr_,即业务系统表名称
src_table_name=${hive_table_name#*`echo ${hive_table_name} | awk -F"_" '{print$1"_"}'`}
#判断source_db_tns.conf里对应的TNS配置内容是否存在,且必须唯一
count_tns=`grep "^"${source_sys}"==" ${where_tns_conf} | wc -l`
if [ "${count_tns}" -eq "0" ] ; then
echo -e "\n[ERROR] The TNS of \"${source_sys}\" is not in \"${where_tns_conf}\" , please add."
exit 1
else
if [ "${count_tns}" -gt "1" ] ; then
echo -e "\n[ERROR] There is more than one TNS of \"${source_sys}\" , please check \"${where_tns_conf}\" . "
exit 1
fi
fi
#获取TNS配置
connect_db_tns=`cat ${where_tns_conf} | grep ${source_sys}"==" | awk -F '==' '{print$2}'`
count_maohao=`echo ${connect_db_tns} | awk -F":" '{print NF}'`
if [ ! ${connect_db_tns:0:5} = "jdbc:" -o "${count_maohao}" -lt "3" ] ; then
echo -e "\n[ERROR] The TNS of \"${source_sys}\" is not correct , please check \