#!/bin/bash
cd $(cd $(dirname $0); pwd)
set -e
source ~/env.sh
runday=$(date -d "-1 days" +"%Y-%m-%d")
table_name=""
check_partition=""
check_partition2=""
table_name_tmp=$1
check_partition_tmp=$2
check_partition_tmp2=$3
#获取要检查的表名
if [[ "${table_name_tmp:0:2}" == "-t" ]]; then
table_name=${table_name_tmp:2:99}
echo "${table_name}"
else
echo "=============请按正确格式传入表名(-t加表名)==============="
exit 0
fi
echo "==========要检查的表名为${table_name}"
#判断是否传入开始分区,获取分区条件
if [[ "${check_partition_tmp:0:3}" == "-ds" ]]; then
check_partition=${check_partition_tmp:3:99}
echo "${check_partition}"
else
echo "=========请按正确格式传入开始分区条件(-ds加分区条件,不可以有空格。如果不是分区表只传-ds)==========="
exit 0
fi
#判断是否传入结束分区,获取分区条件
if [[ "${check_partition_tmp2:0:3}" == "-de" ]]; then
check_partition2=${check_partition_tmp2:3:99}
echo "${check_partition2}"
else
echo "=========请按正确格式传入结束分区条件(-de加分区条件,不可以有空格。如果不是分区表只传-de)==========="
exit 0
fi
#查看表名是否存在
echo "==============检查该表是否存在======================"
hive -e "select * from ${table_name} limit 1;"
echo "--------------组合检查列----------------------------"
#定义变量,组合为检查列集合
column_list=""
index=1
#遍历所有参数
for arg in $*
do
if [[ ${index} > 3 ]];then
column_list="${column_list},$arg"
fi
let index+=1
done
column_list=${column_list:1}
echo "检查列组合完成${column_list}"
if [[ ${column_list} == "" ]];then
echo "=========该次检查参考列为所有列=============="
touch tmp_column.txt tmp_column2.txt
hive -e "desc ${table_name}" >> ./tmp_column.txt
awk '{print $1}' ./tmp_column.txt >> ./tmp_column2.txt
rm -rf ./tmp_column.txt
column_list=`cat tmp_column2.txt | tr '\n' ','`
column_list=${column_list%,*}
column_list=${column_list%,,#,#*}
rm -rf ./tmp_column2.txt
echo "=========所有列为${column_list}=============="
fi
#判断该表是否为分区表
echo "==============判断该表是否为分区表================="
create_table_word=`hive -e "show create table ${table_name};"`
par="PARTITIONED BY"
if [[ ${create_table_word} == *$par* ]];then
echo "该表为分区表"
echo "============开始检查数据================="
#检查分区表sql
check_partition_sql=""
if [[ ${check_partition2} == "" ]];then
check_partition_sql="
select count(*) from (
select ${column_list},count(*) cnt from ${table_name} where ${check_partition} group by ${column_list} having cnt > 1
)aa
"
else
check_partition_sql="
select count(*) from (
select ${column_list},count(*) cnt from ${table_name} where ${check_partition} and ${check_partition}2 group by ${column_list} having cnt > 1
)aa
"
fi
check_partition_sql_result=`hive -e "${check_partition_sql}"`
if [[ ${check_partition_sql_result} == 0 ]];then
echo "检查结果:无重复数据"
else
echo "检查结果:有重复数据"
fi
else
echo "该表为非分区表"
echo "============开始检查数据================="
#检查非分区表的sql
check_no_partition_sql="
select count(*) from (
select ${column_list},count(*) cnt from ${table_name} where 1=1 group by ${column_list} having cnt > 1)aa
"
check_no_partition_sql_result=`hive -e "${check_no_partition_sql}"`
if [[ ${check_no_partition_sql_result} == 0 ]];then
echo "检查结果:无重复数据"
else
echo "检查结果:有重复数据"
fi
fi
echo "=======脚本执行完成======="