前提:分区字段统一为ds,适用于常规增量抽取,通过创建和更新时间抽取。
#!/bin/bash
# 适用于常规增量抽取,通过创建和更新时间抽取
# 手工填写以下参数:
# 分区表达式,hive语法:pt_expr
# 主键字段,联合主键写id1,id2,id3:pk
# 非分区表增量数据量阈值,分区表不用管:inc_cnt
set -e
timer_start=`date "+%Y-%m-%d %H:%M:%S"`
timediff(){
duration=$(($(date +%s -d "${3} ${4}") - $(date +%s -d "${1} ${2}")))
echo "开始时间:${1} ${2}"
echo "结束时间:${3} ${4}"
if [[ $duration -lt 60 ]]
then echo "执行耗时:${duration}s"
elif [[ $duration -lt $((60*60)) ]]
then m=$(($duration/60))
s=$(($duration%60))
echo "执行耗时:${m}m${s}s"
else h=$(($duration/60/60))
m=$(($(($duration%3600))/60))
s=$(($duration%60))
echo "执行耗时:${h}h${m}m${s}s"
fi
}
file_name=`basename $0`
table_name=${file_name%.sh}
tmp_name=${table_name/#ods_/tmp_}
json_name=${table_name}.json
# sql_name=${file_name%.sh}.sql
dir_path=`dirname $0`
# 分区表达式,hive语法
pt_expr="substr(issuedate,1,10)"
# 主键字段,联合主键写id1,id2,id3
pk='id'
# 非分区表增量数据量阈值,分区表不用管
inc_cnt=0
cd $dir_path
# 1、建临时表装增量数据;
echo "判断是否分区,定增量阈值"
is_pt=`hive -e "desc bigdata_ods.${table_name}"|grep -e '^# Partition Information'|wc -l`
if [[ $is_pt == 0 ]]; then
#statements
thr_cnt=$inc_cnt
echo "增量阈值:$thr_cnt"
echo "创建临时表:drop table if exists bigdata_tmp.${tmp_name};create table bigdata_tmp.${tmp_name} like bigdata_ods.${table_name};alter table bigdata_tmp.${tmp_name} set SERDEPROPERTIES('field.delim'='\001')"
hive -e "drop table if exists bigdata_tmp.${tmp_name};create table bigdata_t