Job submission script
[root@tony-client-1-001 ~]# vim /mnt/tony/rec_model/model/data/f0.sh
#!/usr/bin/env bash
curr_dir=$(pwd)
src_path=${curr_dir}
spark_lib_path='/usr/hdp/2.5.0.0-1245/spark/lib'
hbase_lib_path='/usr/hdp/2.5.0.0-1245/hbase/lib'
echo ${spark_lib_path}
echo ${hbase_lib_path}
spark-submit \
--master yarn \
--deploy-mode cluster \
--num-executors 4 \
--executor-memory 4G \
--executor-cores 4 \
--driver-memory 4G \
--queue fintech \
--name 'yhl_f_0' \
--jars ${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar,${hbase_lib_path}/hbase-server.jar,${hbase_lib_path}/hbase-protocol.jar,${hbase_lib_path}/hbase-hadoop2-compat.jar,${hbase_lib_path}/hbase-client.jar,${hbase_lib_path}/hbase-common.jar,${hbase_lib_path}/htrace-core-3.1.0-incubating.jar,/usr/hdp/2.6.1.0-129/hadoop/lib/hadoop-ks3-0.1.jar \
--driver-class-path ${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar:${hbase_lib_path}/hbase-server.jar:${hbase_lib_path}/hbase-protocol.jar:${hbase_lib_path}/hbase-hadoop2-compat.jar:${hbase_lib_path}/hbase-client.jar:${hbase_lib_path}/hbase-common.jar:${hbase_lib_path}/htrace-core-3.1.0-incubating.jar \
--conf spark.executor.extraClassPath=${spark_lib_path}/spark-examples-1.6.2.2.5.0.0-1245-hadoop2.7.3.2.5.0.0-1245.jar:${hbase_lib_path}/hbase-server.jar:${hbase_lib_path}/hbase-protocol.jar:${hbase_lib_path}/hbase-hadoop2-compat.jar:${hbase_lib_path}/hbase-client.jar:${hbase_lib_path}/hbase-common.jar:${hbase_lib_path}/htrace-core-3.1.0-incubating.jar \
${src_path}/f0.py
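Note that --jars takes a comma-separated list (the jars are shipped to the cluster), while --driver-class-path and spark.executor.extraClassPath are JVM classpaths and must be colon-separated. A quick way to confirm the driver actually received these settings is to read them back from the active SparkConf inside the job. A minimal, standalone sketch using only standard Spark configuration keys (the app name is illustrative):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sanity check: print the jar/classpath settings the driver received,
# so a misconfigured --jars or extraClassPath shows up immediately.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("yhl_f_0_check"))
for key in ("spark.jars",
            "spark.driver.extraClassPath",
            "spark.executor.extraClassPath"):
    # SparkConf.get falls back to the given default when the key is unset.
    print key, "=>", sc.getConf().get(key, "<unset>")
sc.stop()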
The Python code for the Spark job
[root@tony-client-1-001 ~]# vim /mnt/tony/rec_model/model/data/f0.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import datetime
# os.environ['SPARK_HOME']="E:/sort/jars/spark-2.1.0-bin-hadoop2.7"
# sys.path.append("E:/sort/jars/spark-2.1.0-bin-hadoop2.7/python")
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.rdd import StorageLevel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, Row
g_conf = {}

# Print a tagged debug line that is easy to grep out of the YARN logs.
def show(x):
    print "############", x

# Merge the caller-supplied settings into the module-level config dict.
def initConfig(conf):
    for i in conf:
        g_conf[i] = conf[i]

def init(conf):
    initConfig(conf)
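init simply copies the caller's settings into g_conf so that later functions can look them up by key. A hedged usage sketch, reusing the imports above; the key names are illustrative assumptions, not taken from the original job:

init({
    "input_path": "/fintech/rec_model/click_log",  # hypothetical HDFS path
    "run_date": datetime.date.today().strftime("%Y%m%d"),
})
show(g_conf)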
# Parse one tab-separated sample line from the click log into named fields.
def f1(x):
    srr = x.encode("utf-8").split("\t")
    y = srr[0]  # label
    uuid = srr[1]
    newsid = srr[2]
    recid = srr[3]
    click_tstp = srr[4]
    cateid = srr[5]
    click_region = srr[6]
    uuid_group = srr[7]
    timestamp = srr[8]
    article_id = srr[9]
    lda_title = srr[10]
    lda_content = srr[11]