使用 yarn 的 spark-submit 提交时,加入 Kryo 序列化参数:
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
另,附:Spark-submit提交作业的shell版本模板。
一个好的shell脚本,不仅让人看起来赏心悦目,更易于查找问题,方便交接
#!/bin/sh
# Spark-submit job template: resolves run dates from $1 (or defaults to
# yesterday) and submits a Spark application to YARN.
#
# BUGFIX: the original header used '>>descripe', which is an append
# redirection — it creates a file named "descripe" in the cwd and then the
# following lines are executed as commands. A no-op (:) fed by a quoted
# here-doc is the idiomatic block comment.
: <<'descripe'
shell template
@author zjf
@date 2018-05-28
descripe

## source environment files
# Guarded so a missing file does not abort the job or pollute stderr.
[ -f /etc/profile ] && . /etc/profile
[ -f ~/.bash_profile ] && . ~/.bash_profile

## open spark GC log
# NOTE(review): '-Xloggc:tmp/gc_log' is relative to the submit cwd;
# '/tmp/gc_log' was probably intended — confirm before changing.
export SPARK_SUBMIT_OPTS=" -Xloggc:tmp/gc_log -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCDetails -XX:+PrintGCDateStamps -verbose:gc -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M "
## args check
#   no argument  -> run for yesterday
#   one argument -> run for the given day, e.g. '20170718' (an optional
#                   trailing 2-digit hour is captured into format_hour)
#   otherwise    -> usage error
if [ $# -eq 0 ]
then
    # Default every date/partition variable to yesterday.
    CUR_DATE=$(date -d "-1 day" +%Y-%m-%d)
    CUR_PARTITION=$(date -d "-1 day" +%Y%m%d)
    BEGIN_PARTITION=$CUR_PARTITION
    END_PARTITION=$CUR_PARTITION
    BEGIN_DATE=$CUR_DATE
    END_DATE=$CUR_DATE
elif [ $# -eq 1 ]
then
    # Extract the 8-digit day (and optional trailing 2-digit hour) from $1.
    format_day=$(echo "$1" | grep -o '[0-9]\{8\}')
    format_hour=$(echo "$1" | grep -o '[0-9]\{2\}$')
    # BUGFIX: without this check an argument with no 8-digit date leaves
    # format_day empty, and 'date -d ""' silently resolves to *today*,
    # producing wrong partitions with no error.
    if [ -z "$format_day" ]
    then
        echo "the args is wrong ,you should give it like '20170718'" >&2
        exit 1
    fi
    CUR_DATE=$(date -d "$format_day" +%Y-%m-%d)
    CUR_PARTITION=$(date -d "$format_day" +%Y%m%d)
    LAST_MONTH=$(date -d "${format_day} -1 month" +%Y%m)
    # Begin/end span a single day, identical to CUR_*.
    BEGIN_DATE=$CUR_DATE
    END_DATE=$CUR_DATE
    BEGIN_PARTITION=$CUR_PARTITION
    END_PARTITION=$CUR_PARTITION
else
    echo "the args is wrong ,you should give it like '20170718'" >&2
    exit 1
fi
## resolve script-relative directories
# Quoted expansions so paths containing spaces resolve correctly; $( )
# replaces the harder-to-read backticks.
WORK_DIR=$(cd "$(dirname "$0")" && pwd)
BASH_DIR=$WORK_DIR/bin
DATA_DIR=$WORK_DIR/data/$CUR_PARTITION
LOGS_DIR=$WORK_DIR/log/$CUR_PARTITION

## echo the resolved run parameters so the job log records exactly what ran
echo "CUR_DATE=${CUR_DATE}"
echo "CUR_PARTITION=${CUR_PARTITION}"
echo "LAST_MONTH=${LAST_MONTH}"
echo "BEGIN_PARTITION=${BEGIN_PARTITION}"
echo "END_PARTITION=${END_PARTITION}"
echo "BEGIN_DATE=${BEGIN_DATE}"
echo "END_DATE=${END_DATE}"
# Raw first argument is forwarded to the Spark application unchanged.
COMMAND="$1"
echo "RUN BATCH : ${COMMAND}"
## submit the Spark job to YARN (cluster deploy mode)
class_home="cn.com.xiaomi.Test"
# NOTE(review): spark.storage.memoryFraction / spark.shuffle.memoryFraction
# are legacy (pre-1.6) settings; on a Spark 2.x cluster they only take effect
# with spark.memory.useLegacyMode=true — confirm they are still intended.
# NOTE(review): the queue name contains a doubled dot
# ('production.queue..miui.game') — verify against the real YARN queue name.
/home/zjf/bin/spark-submit \
--cluster hadoopprc-hadoop-spark2.1 \
--conf spark.yarn.job.owners=xxx \
--conf spark.yarn.alert.phone.number=188888899 \
--conf spark.yarn.alert.mail.address='xxx@xiaomi.com' \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.storage.memoryFraction=0.5 \
--conf spark.shuffle.memoryFraction=0.3 \
--class "${class_home}" \
--master yarn \
--deploy-mode cluster \
--queue production.queue..miui.game \
--conf "spark.executor.extraJavaOptions=-XX:MaxDirectMemorySize=1024m" \
--conf spark.default.parallelism=1600 \
--driver-memory 14g \
--executor-memory 14g \
--executor-cores 2 \
--num-executors 400 \
"$WORK_DIR/huyu-cdi-spark-1.0-SNAPSHOT.jar" \
${COMMAND}