第一次编写一个比较复杂的 Shell 脚本,用于实现以下功能:
1.查询hive状态表,表中根据request_type类型不同,执行数据refresh或者report逻辑。如果发现有refresh,默认插入2条report数据。
2.如果同时有数据refresh和report,默认一定要refresh执行完,才能跑report.
3.因为这里用 Hive 表做状态控制,所以之前发现:当前台向此表录入数据、后台 autosys 同时执行此脚本时,会产生数据不一致。为此修改了逻辑,每次运行只执行一次 Hive 数据查询;如果插入和查询恰好同时发生,就牺牲时效,等 autosys 下一次执行时再查询到该数据。曾考虑过对 Hive 分区加锁,但因急于上线,没有时间研究如何设置锁,留待后续研究。
脚本如下:
#!/bin/ksh
# Trigger script: reads a Hive status table and dispatches data-refresh /
# report jobs (functions and main body below).
# Required env: RPM_ENVIRONMENT (DEV|SIT|UAT|PROD) selects the Kerberos
# keytab / Hive / Oracle connection settings exported here.
dir=/data/1/XXX/bin/projects/1.0.0-SNAPSHOT/Configure/
currTime=$(date +%Y%m%d%H%M%S)
echo -e "Start this script time: $currTime \n"
# Shared helpers: provides run_sql, which leaves its output in the global
# variable sql_result_set (used by run_seq below).
. /data/1/XXX/bin/XXX/commonlib.ksh
META_SCHEMA=XXX_META
# Per-environment setup: obtain a Kerberos ticket and export the beeline
# JDBC string plus the Oracle connection string used by run_sql.
case "$RPM_ENVIRONMENT" in
  DEV|SIT)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  UAT)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  PROD)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  *)
    # Previously an unknown/unset environment fell through silently and the
    # script ran with no connection settings; at least warn about it now.
    echo "WARN: unknown RPM_ENVIRONMENT='${RPM_ENVIRONMENT}' - no connection settings exported" >&2
    ;;
esac
function run_seq(){
# Fetch the next value of two Oracle sequences and export them as
# sequenceNum1 / sequenceNum2; they are used later as surrogate ids for the
# two report rows inserted into the Hive status table.
# Relies on run_sql from commonlib.ksh, which appears to leave the query
# output in the global variable sql_result_set -- TODO confirm contract.
run_sql "${META_SCHEMA}" "select REPORT_STATUS_FIN_SEQUENCE.nextval || '|' from dual;"
result1=$sql_result_set
# The query appends '|' so cut -f1 can strip anything after the number.
export sequenceNum1=$( echo $result1 | cut -d'|' -f1 )
echo -e "sequenceNum1=${sequenceNum1}"
run_sql "${META_SCHEMA}" "select XXX_SEQUENCE.nextval || '|' from dual;"
result2=$sql_result_set
export sequenceNum2=$( echo $result2 | cut -d'|' -f1 )
echo -e "sequenceNum2=${sequenceNum2}"
}
function usage() {
for pid in $(jobs -p)
do
wait $pid
status=$?
echo "## status=${status} ,pid=${pid} "
if [ $status != 0 ];then
echo "## $pid status is $status have some error!" >> $dir/batchTrigger.log
else
echo "## $pid status is $status success!" >> $dir/batchTrigger.log
fi
done
}
function runSomeReport(){
# Launch reports array[start] .. array[end-1] in parallel (one batch.sh per
# record), then block via usage until the whole batch has finished.
# NOTE: reads the global array 'array' populated by processBatch. A ksh
# array cannot be passed as a single positional parameter -- the old
# 'array=$1' only reassigned element 0 of the existing global array, so $1
# is intentionally ignored here.
start=$2
end=$3
echo "runSomeReport: start=$start,end=$end"
for((j=start;j<end;j++))
do
var1=${array[j]}
echo "### run i = ${j}, value = ${var1}"
# Record layout: runId,formName,asOfDate,requestType (comma separated).
runId=$(echo "$var1" | cut -d',' -f1)
formName=$(echo "$var1" | cut -d',' -f2)
asOfDate=$(echo "$var1" | cut -d',' -f3)
requestType=$(echo "$var1" | cut -d',' -f4)
param2="#COB#=${asOfDate};#FORM_NAME#=${formName};#RUN_ID#=${runId};#REQUEST_TYPE#=${requestType}"
echo "$param2"
# Quoted so a form name containing spaces cannot split into extra args.
sh "${dir}batch.sh" "my" "$param2" "$formName" &
done
echo "### waiting report finish. i=${j}"
usage
}
function processBatch(){
# Execute the records given as arguments in chunks of $step parallel jobs;
# each chunk completes before the next starts (runSomeReport blocks).
# The records are published in the global array 'array' because
# runSomeReport reads that array directly (ksh arrays cannot be passed by
# value); "$array" (= array[0]) is passed only to keep the old call shape.
array=("$@")
totalNum=${#array[*]}
step=6   # max number of batch.sh processes in flight at the same time
echo -e "### Need run ${totalNum} reports! step=${step} \n"
# Chunk loop: end = min(start+step, totalNum). The previous expr-based
# loopNum scheme issued a spurious empty trailing call when totalNum was an
# exact multiple of step.
start=0
while [ "$start" -lt "$totalNum" ]
do
end=$((start + step))
if [ "$end" -gt "$totalNum" ]
then
end=$totalNum
fi
echo "### run chunk: start=${start}, end=${end}"
runSomeReport "$array" "$start" "$end"
echo -e "### chunk [${start},${end}) finished \n"
start=$end
done
}
arrReport=()   # REPORT-type records collected for the report phase
arrDt=()       # DATAREFRESH-type records collected for the refresh phase
# Concurrency guard: do nothing while a previous Spark application is still
# running. pgrep -f replaces the fragile 'ps | grep | grep -v grep' chain.
psID=$(pgrep -f "com.XXX.XXXApplication")
echo " psID = ${psID} "
if [ -z "${psID}" ];
then
echo -e "No process is running now! Can process now...\n"
tableName="HIVE_DB_NAME.HIVE_TBL_NAME"
# Single Hive read per run (by design, to avoid racing front-end inserts --
# see header note): fetch every id/name/cob/request_type whose aggregated
# status is exactly INITIATED, for both request types.
sqlAll="select id,name,as_of_date,request_type from (select id,name,as_of_date,request_type,concat_ws(',',collect_list(status)) as combine_status from ${tableName} where upper(request_type)='DATAREFRESH' group by id,name,as_of_date,request_type) as tmp where tmp.combine_status='INITIATED' union select id,name,as_of_date,request_type from (select id,name,as_of_date,request_type,concat_ws(',',collect_list(status)) as combine_status from ${tableName} where upper(request_type)='REPORT' group by id,name,as_of_date,request_type) as tmp where tmp.combine_status='INITIATED';"
dataAll=$(beeline -u "${beeline_str}" --ShowHeader=false --outputformat=csv2 -e "${sqlAll}")
# Split beeline's csv2 output into one array element per line via plain IFS
# word splitting. The previous ${dataAll//\n/ } was a bug: in parameter
# expansion \n is a literal 'n', so it blanked every 'n' in the data.
arrayAll=(${dataAll})
totalNum=${#arrayAll[*]}
echo -e "arrayAll=${arrayAll},totalNum=${totalNum} \n"
if [[ $totalNum -eq 0 ]]
then
echo "No record need to do...exit now!"
exit 0;
fi
# Partition the records by request type (field 4 of the csv row).
for((k=0;k<totalNum;k++))
do
var1=${arrayAll[k]}
echo "### index k = ${k}, value = ${var1}"
tmp=$(echo "$var1" | cut -d',' -f4)
if [[ ${tmp} == "REPORT" ]]
then
echo -e "${var1} is report"
arrReport=("${arrReport[@]}" "$var1")
echo -e "arrReport size ${#arrReport[*]} \n"
elif [[ ${tmp} == "DATAREFRESH" ]]
then
echo -e "${var1} is data-refresh"
arrDt=("${arrDt[@]}" "$var1")
echo -e "arrDt size ${#arrDt[*]} \n"
fi
done
if [[ ${#arrDt[*]} -gt 0 ]];
then
## 1. A refresh implies two follow-up reports (DQ + RECON): reserve their
##    sequence ids and insert the two INITIATED rows into the status table.
run_seq
runId=$(echo "${arrDt[0]}" | cut -d',' -f1)
tmp=$(echo "${arrDt[0]}" | cut -d',' -f2)
freq=${tmp: -1}   # last char of the name encodes the run frequency
asOfDt=$(echo "${arrDt[0]}" | cut -d',' -f3)
echo -e "insert 2 records the frequency is ${freq}, the cob_date is ${asOfDt} \n"
sqlStr="insert into ${tableName} partition(as_of_date='$asOfDt') select $sequenceNum1,$runId,'RECON_${freq}','REPORT',CAST(from_unixtime(unix_timestamp(current_timestamp())) AS STRING) as trigger_time,'INITIATED','run report'; insert into ${tableName} partition(as_of_date='$asOfDt') select $sequenceNum2,$runId,'DQ_${freq}','REPORT',from_unixtime(unix_timestamp()),'INITIATED','run report';"
echo -e "\n insert SQL: ${sqlStr} \n"
# Run beeline directly: the original $(beeline ...) command substitution
# tried to execute beeline's stdout as a command.
beeline -u "${beeline_str}" --ShowHeader=false --outputformat=csv2 -e "${sqlStr}"
echo -e "\n insert DQ and Recon for ${freq} end.\n"
dqStr="${runId},DQ_${freq},${asOfDt},REPORT"
arrReport=("${arrReport[@]}" "$dqStr")
reconStr="${runId},RECON_${freq},${asOfDt},REPORT"
arrReport=("${arrReport[@]}" "$reconStr")
echo -e "arrReport size is ${#arrReport[*]} "
## 2. Refresh must finish before any report runs; processBatch blocks until
##    all its chunks complete, which enforces that ordering.
echo -e "\n---------- Do Data-refresh! ----------records number = ${#arrDt[*]} "
processBatch "${arrDt[@]}"
fi
if [ ${#arrReport[*]} -gt 0 ];
then
echo -e "\n---------- Run Reports! ----------records number = ${#arrReport[*]} "
processBatch "${arrReport[@]}"
else
echo -e "---------- Don't run Reports! ----------------------------------------\n"
fi
else
{
echo "Job already running with processID $psID";
echo "Can not re-run the existing instance script! "
exit 0;
}
fi
exit 0;
这里设置了步长 step,即每次可以并行执行的 batch.sh 脚本数量;此脚本会触发 Spark 程序,执行业务处理。
batch.sh如下:
#!/bin/ksh
# batch.sh -- submit one enrichment/report run to Spark on YARN.
#   $1  run mode passed straight through to the application (caller sends "my")
#   $2  parameter string: #COB#=...;#FORM_NAME#=...;#RUN_ID#=...;#REQUEST_TYPE#=...
#   $3  form name; used for the Spark app name and the per-form log file
# Requires in the environment: APP_PATH (application jar) and project_dir
# (log root) -- presumably exported by the caller's env; verify.
param1=$1
param2=$2
param3=$3
name="ENRICH__${param3}"
echo "-----------------start to run-------------------- ${param3} ---------------------APP_PATH=${APP_PATH}"
# All expansions quoted so a form name or path containing spaces cannot
# split the spark-submit argument list.
spark-submit --master yarn \
--name "$name" \
--deploy-mode client \
--driver-memory 10g \
--num-executors 4 \
--executor-cores 8 \
--executor-memory 10g \
--conf spark.driver.cores=10 \
--conf spark.driver.maxResultSize=8g \
--conf spark.default.parallelism=10 \
--conf spark.sql.shuffle.partitions=10 \
--conf spark.rdd.compress=true \
--conf spark.locality.wait=6 \
--conf spark.sql.autoBroadcastJoinThreshold=-1 \
--conf spark.sql.caseSensitive=false \
--conf spark.dynamicAllocation.enabled=false \
--conf spark.network.timeout=300s \
--conf spark.executor.memoryOverhead=4096 \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--class com.XXX.XXXApplication "$APP_PATH" "$param1" "$param2" >"${project_dir}/logs/job_${param3}.log" 2>&1