分享一个几年前写的hive的话单比对的sql
需求
实现
- 创建外部表
- 读取目录的层级关系,修改外部表(体现hive表的分区属性)
- 执行话单比对
- 结果保存到hive表
操作人员:
- 配置两边的路径
- 配置构成主键的字段、要比对的字段
这个可能就是以配置文件的形式提供,你的linux shell要读取这个配置文件
//========================================================================
source_hdfs=/apps/prep/normal/archive/normal/591/
dest_hdfs=/user/bill/
comapared_words="EVENT_TYPE_ID,TICKET_ID"
#自定义对比日期
#如果要设置对比日期为当天的话:TODAY=$(date +%Y/%m/%d)
TODAY=2016/10/10
#例如$TODAY-3~$TODAY$-1 $TODAY$是23号那边就是比22,21,20
#$TODAY-start~$TODAY-end
start=3
end=0
//=========================================================================
start_shell_time=$(date +%s)
echo "---------开始脚本的时间为"$start_shell_time
echo "---------加载配置--------------------------------"
! . /home/hive/shill_hive/hql.conf && echo "hql.conf does not exist!"&& return
echo "---------开始话单比对脚本------------------------"
echo "---------今天日期为:"$TODAY"---------"
echo $TODAY
echo $start
echo $end
end2=`expr $end - 1`
start_date=`date -d "$start day ago $TODAY" +%Y/%m/%d`
end_date=`date -d "$end day ago $TODAY" +%Y/%m/%d`
echo "---------对比时间为:"$start_date"~"$end_date"---------------"
first=`date -d "$start day ago $TODAY" +%Y%m%d`
second=`date -d "$end2 day ago $TODAY" +%Y%m%d`
#将所有的时间保存到数组date_array中
n=0
while [ "$first" != "$second" ]
do
date_array[$n]=`date -d "$first" +%Y/%m/%d`
echo `date -d "$first" +%Y/%m/%d`
let n=`expr $n + 1`
let first=`date -d "-1 days ago ${first}" +%Y%m%d`
done
echo "---------准备创建hive外部表source与dest"
drop_source_table="DROP TABLE IF EXISTS source;"
drop_dest_table="DROP TABLE IF EXISTS dest;"
create_source_table="
CREATE EXTERNAL TABLE source (
json string
)
PARTITIONED BY(month string,date string,par string)
ROW FORMAT DELIMITED
STORED AS textfile
LOCATION '$source_hdfs';"
create_dest_table="
CREATE EXTERNAL TABLE dest (
json string
)
PARTITIONED BY(month string,date string,par string)
ROW FORMAT DELIMITED
STORED AS textfile
LOCATION '$dest_hdfs';"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$drop_source_table"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$create_source_table"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$drop_dest_table"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$create_dest_table"
echo "---------hive外部表创建成功---------------"
#获得所有的时间 然后分区location 2016/10/10 2016(0) 10(1) 10(2)
source_partition_sql="alter table source add"
dest_partition_sql="alter table dest add"
index=0
while (( $index<${#date_array[@]} ))
do
zz=${date_array[$index]}
year_mon_day_arr=(${zz//// })
year=${year_mon_day_arr[0]}
month=${year_mon_day_arr[1]}
day=${year_mon_day_arr[2]}
for j in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30; do
source_partition_sql="$source_partition_sql partition (month='$year$month', date='$day',par=$j) location '$year$month/$day/$j'"
done
for z in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30; do
dest_partition_sql="$dest_partition_sql partition (month='$year$month', date='$day',par=$z) location '$year$month/$day/$z'"
done
let "index++"
done
#看看分区语句是否正确
#echo $source_partition_sql
#echo $dest_partition_sql
#支持递归子目录
#set hive.mapred.supports.subdirectories=true
echo "---------准备导入分区数据"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$source_partition_sql"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$dest_partition_sql"
echo "---------分区数据导入成功"
#comapared_words="EVENT_TYPE_ID,TICKET_ID"
arr=(${comapared_words//,/ })
index=0
while (( $index<${#arr[@]} ))
do
pre_word+=",'${arr[index]}'"
source_word+=",${arr[index]}1"
dest_word+=",${arr[index]}2"
[[ $index != 0 ]] && tab_comp_sql+=" or "
tab_comp_sql+="tab1.${arr[index]}1<>tab2.${arr[index]}2"
let "index++"
done
#echo $pre_word
#echo $source_word
#echo $dest_word
#echo $tab_comp_sql
# start_year=`date -d "$start_date" +%Y`
# end_year=`date -d "$end_date" +%Y`
#start_month=`date -d "$start_date" +%m`
# end_month=`date -d "$end_date" +%m`
# start_day=`date -d "$start_date" +%d`
# end_day=`date -d "$end_date" +%d`
compare_sql="
CREATE TABLE result as
select tab1.SERV_ID1, tab1.SOURCE_EVENT_TYPE_ID1, tab1.ORG_CDR_ID1, tab1.COLLECT_CDR_ID1, tab2.SERV_ID2, tab2.SOURCE_EVENT_TYPE_ID2, tab2.ORG_CDR_ID2, tab2.COLLECT_CDR_ID2 from
(select b.* from source a lateral view json_tuple(a.json, 'SERV_ID', 'SOURCE_EVENT_TYPE_ID', 'ORG_CDR_ID', 'COLLECT_CDR_ID'$pre_word) b as SERV_ID1, SOURCE_EVENT_TYPE_ID1, ORG_CDR_ID1, COLLECT_CDR_ID1 $source_word ) tab1
full outer join
(select b.* from dest a lateral view json_tuple(a.json, 'SERV_ID', 'SOURCE_EVENT_TYPE_ID', 'ORG_CDR_ID', 'COLLECT_CDR_ID' $pre_word) b as SERV_ID2, SOURCE_EVENT_TYPE_ID2, ORG_CDR_ID2, COLLECT_CDR_ID2 $dest_word ) tab2
on (tab1.SERV_ID1=tab2.SERV_ID2 and tab1.SOURCE_EVENT_TYPE_ID1=tab2.SOURCE_EVENT_TYPE_ID2 and tab1.ORG_CDR_ID1=tab2.ORG_CDR_ID2 and tab1.COLLECT_CDR_ID1=tab2.COLLECT_CDR_ID2)
where (tab1.SERV_ID1=tab2.SERV_ID2 and tab1.SOURCE_EVENT_TYPE_ID1=tab2.SOURCE_EVENT_TYPE_ID2 and tab1.ORG_CDR_ID1=tab2.ORG_CDR_ID2 and tab1.COLLECT_CDR_ID1=tab2.COLLECT_CDR_ID2 and ($tab_comp_sql))
or (tab1.SERV_ID1 is null and tab1.SOURCE_EVENT_TYPE_ID1 is null and tab1.ORG_CDR_ID1 is null and tab1.COLLECT_CDR_ID1 is null and tab2.SERV_ID2 is not null and tab2.SOURCE_EVENT_TYPE_ID2 is not null and tab2.ORG_CDR_ID2 is not null and tab2.COLLECT_CDR_ID2 is not null)
or (tab1.SERV_ID1 is not null and tab1.SOURCE_EVENT_TYPE_ID1 is not null and tab1.ORG_CDR_ID1 is not null and tab1.COLLECT_CDR_ID1 is not null and tab2.SERV_ID2 is null and tab2.SOURCE_EVENT_TYPE_ID2 is null and tab2.ORG_CDR_ID2 is null and tab2.COLLECT_CDR_ID2 is null);
"
echo $compare_sql
echo "---------开始比对数据"
drop_result_table="DROP TABLE IF EXISTS result;"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$drop_result_table"
/usr/hdp/2.2.0.0-2041/hive/bin/hive -e "$compare_sql"
echo "---------比对数据完成"
end_shell_time=$(date +%s) &&
echo "---------脚本结束的时间为"$end_shell_time