本文介绍自动化全量抽取关系新数据库(oracle,mysql,postgresql)到hive/impala。
##全量抽数: 需要五个参数 参数1:全量标示 参数2: 源数据库用户名 参数3:hive表名(要包含源表名)
## sh sqoop_import_source_data_by_tables_cover.sh cover db_name tab_name
#获取传参
cover_or_add=`echo $1 | tr 'A-Z' 'a-z'` ##第一个参数字母大写转成小写
#业务系统简称(与source_db_tns.conf里的一致)
source_sys=`echo $2 | tr '[A-Z]' '[a-z]'`
#hive表名
hive_table_name="$3"
#The first args must be 'cover'
if [ "${cover_or_add}" = "cover" ] ; then
where_condition="where 1=1"
if [ ! $# -eq 3 ] ; then
echo -e "\n[ERROR] You must input 3 args for increment ETL! [cover]"
echo " Format:sh $0 [cover] [database] [target_table_name] "
echo " Example:sh $0 cover test dwr_product "
exit 1
fi
else
echo -e "\n[ERROR] The first args must be 'cover' !"
exit 1
fi
# 判断hive table_name 是否合法
where_tns_conf='/RPT/etlscript/sqoop/conf/source_db_tns.conf'
if [ ! ${hive_table_name:0:4} = "dwr_" ] ; then
echo -e "\n[ERROR] The target table name(the 2nd args) must start with \"dwr_\""
exit 1
else
count_=`echo ${hive_table_name} | awk -F"_" '{print NF}'`
if [ "${count_}" -lt "2" ] ; then
echo -e "\n[ERROR] The target table name must like \"dwr_xxx\"."
exit 1
fi
fi
#hive表去掉dwr_,即业务系统表名称
src_table_name=${hive_table_name#*`echo ${hive_table_name} | awk -F"_" '{print$1"_"}'`}
#判断source_db_tns.conf里对应的TNS配置内容是否存在,且必须唯一
count_tns=`grep "^"${source_sys}"==" ${where_tns_conf} | wc -l`
if [ "${count_tns}" -eq "0" ] ; then
echo -e "\n[ERROR] The TNS of \"${source_sys}\" is not in \"${where_tns_conf}\" , please cover."
exit 1
else
if [ "${count_tns}" -gt "1" ] ; then
echo -e "\n[ERROR] There is more than one TNS of \"${source_sys}\" , please check \"${where_tns_conf}\" . "
exit 1
fi
fi
#获取TNS配置
connect_db_tns=`cat ${where_tns_conf} | grep ${source_sys}"==" | awk -F '==' '{print$2}'`
count_maohao=`echo ${connect_db_tns} | awk -F":" '{print NF}'`
if [ ! ${connect_db_tns:0:5} = "jdbc:" -o "${count_maohao}" -lt "3" ] ; then
echo -e "\n[ERROR] The TNS of \"${source_sys}\" is not correct , please check \"${where_tns_conf}\" . "
exit 1
fi
#获取表栏位类型 (目前适用于oracle\mysql\postgresql,可扩展)
type_tns=`echo ${connect_db_tns} | awk -F":" '{print$2}'|tr '[A-Z]' '[a-z]'`
case ${type_tns} in
oracle )
load_data_start_time="to_char(sysdate,'yyyy-mm-dd hh24:mi:ss') "
data_type_sql="select lower(table_name) as table_name, nvl2(translate(lower(column_name),'\12