Spark-shell 参数
Spark-shell 是以一种交互式命令行方式将 Spark 应用程序跑在指定模式上;也可以通过 spark-submit 提交指定应用程序。Spark-shell 底层调用的是 spark-submit,二者的使用参数是一致的,可通过 --help 查看参数:
--master: 指定运行模式,spark://host:port, mesos://host:port, yarn, or local[n].
--deploy-mode: 指定将 driver 端运行在 client 还是在 cluster.
--class: 指定运行程序 main 方法类名,一般是应用程序的包名+类名
--name: 应用程序名称
--jars: 需要在 driver 端和 executor 端运行的 jar,如 mysql 驱动包
--packages: maven 管理的项目坐标 GAV,多个以逗号分隔
--conf: 以 key=value 的形式传入 SparkConf 参数,所传入的参数必须是以 spark. 开头
--properties-file: 指定新的 conf 文件,默认使用 spark-defaults.conf
--driver-memory: 指定 driver 端运行内存,默认 1G
--driver-cores: 指定 driver 端 cpu 数量,默认 1,仅在 Standalone 和 Yarn 的 cluster 模式下
--executor-memory: 指定 executor 端的内存,默认 1G
--total-executor-cores: 所有 executor 使用的 cores
--executor-cores: 每个 executor 使用的 cores
--driver-class-path: driver 端的 classpath
spark.executor.extraClassPath: executor 端的 classpath(注意:spark-submit 并没有 --executor-class-path 选项,需通过 --conf 设置该配置项)
# Submit the streaming job to YARN in client mode, detached from the terminal
# (nohup + &), appending stdout/stderr of the driver to the log file.
# --jars expands every jar under lib/ and joins them with commas, as required
# by spark-submit's comma-separated --jars syntax.
# NOTE(review): "yarn-client" is the deprecated pre-2.0 spelling of
# "--master yarn --deploy-mode client" — confirm the target Spark version.
# NOTE(review): "bigscreem.out" looks like a typo for "bigscreen.out" — confirm
# before renaming, as other tooling may already reference this file name.
nohup /spark/bin/spark-submit \
--class com.eptok.scala.stream.Stream \
--master yarn-client \
--num-executors 2 --executor-memory 2G \
--executor-cores 2 --driver-memory 2G \
--jars $(echo /home/mongo/spark/spark_interface-1.0/lib/*.jar | tr ' ' ',') \
/spark/spark_interface-1.0/spark-1.0.jar >>bigscreem.out &
下面是消费 Kafka 的实现代码:
object BigScreenStream {

  /**
   * Entry point for the big-screen Kafka streaming job: builds the
   * StreamingContext, starts it, arms the graceful-shutdown watcher,
   * and blocks until the job terminates.
   */
  def main(args: Array[String]): Unit = {
    // Port for the embedded HTTP server (used by the shutdown helpers).
    val serverPort = 5555
    // HDFS marker file; StreamUtils.stopByMarkFile watches it and stops
    // the job gracefully once the file appears.
    val stopMarkerPath = "/spark/streaming/stop/bigScreen"

    // Runtime parameters. NOTE(review): an earlier version read these from
    // args(0)..args(3) (batch interval, log flag, initial rate, max rate per
    // partition); they are currently hard-coded instead.
    val batchIntervalSec = 3L     // streaming batch duration, in seconds
    val logLevelFlag     = "0"    // "0" => reduce Spark logging to ERROR
    val initialRate      = 5000L  // backpressure initial read rate
    val maxRatePerPart   = 100L   // max records/sec read per Kafka partition

    val httpServer = new Server(serverPort)

    // Build the streaming pipeline and start consuming.
    val ssc = createStreamingContext(batchIntervalSec, logLevelFlag, initialRate, maxRatePerPart)
    ssc.start()

    // Graceful shutdown strategy: poll the HDFS marker file. (StreamUtils also
    // offers HTTP-based shutdown helpers, which are not used here.)
    StreamUtils.stopByMarkFile(httpServer, ssc, stopMarkerPath)

    // Block the main thread until the streaming job is stopped.
    ssc.awaitTermination()
  }
}
def createStreamingContext(time :Long,flag :String,firstCount :Long,threadCount :Long):StreamingContext={
val appName=this.getClass.getName
println("appName:"+appName)
var conf = new SparkConf()
.setMaster(Property.getProperty("yarn"))
.setAppName(appName)
conf.set("spark.streaming.stopGracefullyOnShutdown","true")//优雅的关闭
conf.set("spark.streaming.receiver.writeAheadLog.enable","true")//防止数据丢失
conf.set("spark.streaming.backpressure.enabled","true")//激活削峰功能
conf.set("spark.streaming.backpressure.initialRate",firstCount.toString)//第一次读取的最大数据值
conf.set("spark.streaming.kafka.maxRatePerPartition",threadCount.toString)//每个进程每秒最多从kafka读取的数据条数
conf.set("spark.mongodb.input.uri", Property.getProperty("bigScreenInUri1"))
conf.set("spark.mongodb.output.uri", Property.getProperty("bigScreenOutUri1"))
conf.set("spark.streaming.kafka.consumer.poll.ms","10000")//拉取数据超时时间
val ssc = new StreamingContext(conf, Seconds(time))
if("0".equals(flag)){
ssc.sparkContext.setLogLevel("ERROR")
}
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> Property.getProperty("kafkaServers"),
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> Property.getProperty("group"),
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array(Property.getProperty("bigScreenTopics"))
val stream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val offlineTranType=Property.getProperty("offlineTranType")
val values=offlineTranType.split(",")
var map: util.Map[String, String]=new util.HashMap()
for( i <- 0 to values.length-1){
map.put(values(i),values(i))
}
System.out.println("线下业务码:"+map)
stream.foreachRDD { rdd =>
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val baseRdd = rdd.map(record => (record.value)).filter(!"null".equals(_)).distinct().filter(
rdd=>{
val json = JSONObject.fromObject(rdd)
val K_BUSINESS_TYPE = json.get("K_BUSINESS_TYPE").asInstanceOf[String]
val K_TRADE_TRANTYPE = json.get("K_TRADE_TRANTYPE").asInstanceOf[String]
if("offline".equals(K_BUSINESS_TYPE)){
//通过线下业务码过滤交易
if(map.containsKey(K_TRADE_TRANTYPE)){
//System.out.println("线下业务码匹配成功:"+K_TRADE_TRANTYPE)
true
}else{
//System.out.println("线下业务码匹配失败:"+K_TRADE_TRANTYPE)
false
}
}else{
val K_TRADE_TYPE = json.get("K_TRADE_TYPE").asInstanceOf[String]
true
}
}
)
baseRdd.persist(StorageLevel.MEMORY_AND_DISK)
…
}