第一次编写一个比较复杂的 Shell 脚本,用于实现以下功能:
1.查询hive状态表,表中根据request_type类型不同,执行数据refresh或者report逻辑。如果发现有refresh,默认插入2条report数据。
2.如果同时有数据refresh和report,默认一定要refresh执行完,才能跑report.
3.因为这里用 Hive 表做状态控制,所以之前发现:当前台向此表录入数据、后台 autosys 同时执行此脚本时,会产生数据不一致。为此修改了逻辑,每次运行只执行一次 Hive 数据查询;如果插入和查询恰好同时发生,就牺牲时效,等 autosys 下一次执行时再查询到该数据。曾考虑过对 Hive 分区加锁,但因急于上线,没有时间研究如何设置锁,留待后续研究。
脚本如下:
#!/bin/ksh
# Trigger script: reads a Hive status table and dispatches data-refresh /
# report jobs (functions and main body below).
# Required env: RPM_ENVIRONMENT (DEV|SIT|UAT|PROD) selects the Kerberos
# keytab / Hive / Oracle connection settings exported here.
dir=/data/1/XXX/bin/projects/1.0.0-SNAPSHOT/Configure/
currTime=$(date +%Y%m%d%H%M%S)
echo -e "Start this script time: $currTime \n"
# Shared helpers: provides run_sql, which leaves its output in the global
# variable sql_result_set (used by run_seq below).
. /data/1/XXX/bin/XXX/commonlib.ksh
META_SCHEMA=XXX_META
# Per-environment setup: obtain a Kerberos ticket and export the beeline
# JDBC string plus the Oracle connection string used by run_sql.
case "$RPM_ENVIRONMENT" in
  DEV|SIT)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  UAT)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  PROD)
    echo -e "--- this is ${RPM_ENVIRONMENT} env ----------------------------"
    kinit -k -t /opt/Cloudera/keytabs/$(whoami).$(hostname -s).keytab $(whoami)/$(hostname -f)@XXX.NET
    export beeline_str="jdbc:hive2://bigdataplatformXXX.net:10000/default;principal=hive/bigdataplatform-XXX.net@XXX.NET;ssl=true"
    export META_PASS='XXX'
    export XXX_META_CONN_STR=XXX_META/'"'${META_PASS}'"'@XXX:XXX/XXX
    ;;
  *)
    # Previously an unknown/unset environment fell through silently and the
    # script ran with no connection settings; at least warn about it now.
    echo "WARN: unknown RPM_ENVIRONMENT='${RPM_ENVIRONMENT}' - no connection settings exported" >&2
    ;;
esac
function run_seq(){
# Fetch the next value of two Oracle sequences and export them as
# sequenceNum1 / sequenceNum2; they are used later as surrogate ids for the
# two report rows inserted into the Hive status table.
# Relies on run_sql from commonlib.ksh, which appears to leave the query
# output in the global variable sql_result_set -- TODO confirm contract.
run_sql "${META_SCHEMA}" "select REPORT_STATUS_FIN_SEQUENCE.nextval || '|' from dual;"
result1=$sql_result_set
# The query appends '|' so cut -f1 can strip anything after the number.
export sequenceNum1=$( echo $result1 | cut -d'|' -f1 )
echo -e "sequenceNum1=${sequenceNum1}"
run_sql "${META_SCHEMA}" "select XXX_SEQUENCE.nextval || '|' from dual;"
result2=$sql_result_set
export sequenceNum2=$( echo $result2 | cut -d'|' -f1 )
echo -e "sequenceNum2=${sequenceNum2}"
}
function usage() {
for pid in $(jobs -p)
do
wait $pid
status=$?
echo "## status=${status} ,pid=${pid} "
if [ $status != 0 ];then
echo "## $pid status is $status have some error!" >> $dir/batchTrigger.log
else
echo "## $pid status is $status success!" >> $dir/batchTrigger.log
fi
done
}
function runSomeReport(){
# Launch reports array[start] .. array[end-1] in parallel (one batch.sh per
# record), then block via usage until the whole batch has finished.
# NOTE: reads the global array 'array' populated by processBatch. A ksh
# array cannot be passed as a single positional parameter -- the old
# 'array=$1' only reassigned element 0 of the existing global array, so $1
# is intentionally ignored here.
start=$2
end=$3
echo "runSomeReport: start=$start,end=$end"
for((j=start;j<end;j++))
do
var1=${array[j]}
echo "### run i = ${j}, value = ${var1}"
# Record layout: runId,formName,asOfDate,requestType (comma separated).
runId=$(echo "$var1" | cut -d',' -f1)
formName=$(echo "$var1" | cut -d',' -f2)
asOfDate=$(echo "$var1" | cut -d',' -f3)
requestType=$(echo "$var1" | cut -d',' -f4)
param2="#COB#=${asOfDate};#FORM_NAME#=${formName};#RUN_ID#=${runId};#REQUEST_TYPE#=${requestType}"
echo "$param2"
# Quoted so a form name containing spaces cannot split into extra args.
sh "${dir}batch.sh" "my" "$param2" "$formName" &
done
echo "### waiting report finish. i=${j}"
usage
}
function processBatch(){
# Execute the records given as arguments in chunks of $step parallel jobs;
# each chunk completes before the next starts (runSomeReport blocks).
# The records are published in the global array 'array' because
# runSomeReport reads that array directly (ksh arrays cannot be passed by
# value); "$array" (= array[0]) is passed only to keep the old call shape.
array=("$@")
totalNum=${#array[*]}
step=6   # max number of batch.sh processes in flight at the same time
echo -e "### Need run ${totalNum} reports! step=${step} \n"
# Chunk loop: end = min(start+step, totalNum). The previous expr-based
# loopNum scheme issued a spurious empty trailing call when totalNum was an
# exact multiple of step.
start=0
while [ "$start" -lt "$totalNum" ]
do
end=$((start + step))
if [ "$end" -gt "$totalNum" ]
then
end=$totalNum
fi
echo "### run chunk: start=${start}, end=${end}"
runSomeReport "$array" "$start" "$end"
echo -e "### chunk [${start},${end}) finished \n"
start=$end
done
}
arrReport=()   # REPORT-type records collected for the report phase
arrDt=()       # DATAREFRESH-type records collected for the refresh phase
# Concurrency guard: do nothing while a previous Spark application is still
# running. pgrep -f replaces the fragile 'ps | grep | grep -v grep' chain.
psID=$(pgrep -f "com.XXX.XXXApplication")
echo " psID = ${psID} "
if [ -z "${psID}" ];
then
echo -e "No process is running now! Can process now...\n"
tableName="HIVE_DB_NAME.HIVE_TBL_NAME"
# Single Hive read per run (by design, to avoid racing front-end inserts --
# see header note): fetch every id/name/cob/request_type whose aggregated
# status is exactly INITIATED, for both request types.
sqlAll="select id,name,as_of_date,request_type from (select id,name,as_of_date,request_type,concat_ws(',',collect_list(status)) as combine_status from ${tableName} where upper(request_type)='DATAREFRESH' group by id,name,as_of_date,request_type) as tmp where tmp.combine_status='INITIATED' union select id,name,as_of_date,request_type from (select id,name,as_of_date,request_type,concat_ws(',',collect_list(status)) as combine_status from ${tableName} where upper(request_type)='REPORT' group by id,name,as_of_date,request_type) as tmp where tmp.combine_status='INITIATED';"
dataAll=$(beeline -u "${beeline_str}" --ShowHeader=false --outputformat=csv2 -e "${sqlAll}")
# Split beeline's csv2 output into one array element per line via plain IFS
# word splitting. The previous ${dataAll//\n/ } was a bug: in parameter
# expansion \n is a literal 'n', so it blanked every 'n' in the data.
arrayAll=(${dataAll})
totalNum=${#arrayAll[*]}
echo -e "arrayAll=${arrayAll},totalNum=${totalNum} \n"
if [[ $totalNum -eq 0 ]]
then
echo "No record need to do...exit now!"
exit 0;
fi
# Partition the records by request type (field 4 of the csv row).
for((k=0;k<totalNum;k++))
do
var1=${arrayAll[k]}
echo "### index k = ${k}, value = ${var1}"
tmp=$(echo "$var1" | cut -d',' -f4)
if [[ ${tmp} == "REPORT" ]]
then
echo -e "${var1} is report"
arrReport=("${arrReport[@]}" "$var1")
echo -e "arrReport size ${#arrReport[*]} \n"
elif [[ ${tmp} == "DATAREFRESH" ]]
then
echo -e "${var1} is data-refresh"
arrDt=("${arrDt[@]}" "$var1")
echo -e "arrDt size ${#arrDt[*]} \n"
fi
done
if [[ ${#arrDt[*]} -gt 0 ]];
then
## 1. A refresh implies two follow-up reports (DQ + RECON): reserve their
##    sequence ids and insert the two INITIATED rows into the status table.
run_seq
runId=$(echo "${arrDt[0]}" | cut -d',' -f1)
tmp=$(echo "${arrDt[0]}" | cut -d',' -f2)
freq=${tmp: -1}   # last char of the name encodes the run frequency
asOfDt=$(echo "${arrDt[0]}" | cut -d',' -f3)
echo -e "insert 2 records the frequency is ${freq}, the cob_date is ${asOfDt} \n"
sqlStr="insert into ${tableName} partition(as_of_date='$asOfDt') select $sequenceNum1,$runId,'RECON_${freq}','REPORT',CAST(from_unixtime(unix_timestamp(current_timestamp())) AS STRING) as trigger_time,'INITIATED','run report'; insert into ${tableName} partition(as_of_date='$asOfDt') select $sequenceNum2,$runId,'DQ_${freq}','REPORT',from_unixtime(unix_timestamp()),'INITIATED','run report';"
echo -e "\n insert SQL: ${sqlStr} \n"
# Run beeline directly: the original $(beeline ...) command substitution
# tried to execute beeline's stdout as a command.
beeline -u "${beeline_str}" --ShowHeader=false --outputformat=csv2 -e "${sqlStr}"
echo -e "\n insert DQ and Recon for ${freq} end.\n"
dqStr="${runId},DQ_${freq},${asOfDt},REPORT"
arrReport=("${arrReport[@]}" "$dqStr")
reconStr="${runId},RECON_${freq},${asOfDt},REPORT"
arrReport=("${arrReport[@]}" "$reconStr")
echo -e "arrReport size is ${#arrReport[*]} "
## 2. Refresh must finish before any report runs; processBatch blocks until
##    all its chunks complete, which enforces that ordering.
echo -e "\n---------- Do Data-refresh! ----------records number = ${#arrDt[*]} "
processBatch "${arrDt[@]}"
fi
if [ ${#arrReport[*]} -gt 0 ];
then
echo -e "\n---------- Run Reports! ----------records number = ${#arrReport[*]} "
processBatch "${arrReport[@]}"
else
echo -e "---------- Don't run Reports! ----------------------------------------\n"
fi
else
{
echo "Job already running with processID $psID";
echo "Can not re-run the existing instance script! "
exit 0;
}
fi
exit 0;
这里设置了步长 step,即每次可以并行执行的 batch.sh 脚本数量;此脚本会触发 Spark 程序,执行业务处理。
batch.sh如下:
#!/bin/ksh
# batch.sh -- submit one enrichment/report run to Spark on YARN.
#   $1  run mode passed straight through to the application (caller sends "my")
#   $2  parameter string: #COB#=...;#FORM_NAME#=...;#RUN_ID#=...;#REQUEST_TYPE#=...
#   $3  form name; used for the Spark app name and the per-form log file
# Requires in the environment: APP_PATH (application jar) and project_dir
# (log root) -- presumably exported by the caller's env; verify.
param1=$1
param2=$2
param3=$3
name="ENRICH__${param3}"
echo "-----------------start to run-------------------- ${param3} ---------------------APP_PATH=${APP_PATH}"
# All expansions quoted so a form name or path containing spaces cannot
# split the spark-submit argument list.
spark-submit --master yarn \
--name "$name" \
--deploy-mode client \
--driver-memory 10g \
--num-executors 4 \
--executor-cores 8 \
--executor-memory 10g \
--conf spark.driver.cores=10 \
--conf spark.driver.maxResultSize=8g \
--conf spark.default.parallelism=10 \
--conf spark.sql.shuffle.partitions=10 \
--conf spark.rdd.compress=true \
--conf spark.locality.wait=6 \
--conf spark.sql.autoBroadcastJoinThreshold=-1 \
--conf spark.sql.caseSensitive=false \
--conf spark.dynamicAllocation.enabled=false \
--conf spark.network.timeout=300s \
--conf spark.executor.memoryOverhead=4096 \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--class com.XXX.XXXApplication "$APP_PATH" "$param1" "$param2" >"${project_dir}/logs/job_${param3}.log" 2>&1