How to run a Spark Streaming program on a cluster:
1. spark-submit --class com.streaming.NetworkWordCount \
--master spark://master:7077 \
--deploy-mode client \
--driver-memory 512m \
--executor-memory 512m \
--total-executor-cores 4 \
--executor-cores 2 \
<path to the application JAR>
2. spark-shell --master spark://master:7077 --total-executor-cores 4 --executor-cores 2 (mainly used for testing)
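For reference, a minimal sketch of what the com.streaming.NetworkWordCount class submitted above might look like. This is an illustration, not the original source; the host "master" and port 9998 are assumptions that match the socket example in section 3 below, and in practice they would be read from args.
package com.streaming
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
object NetworkWordCount {
  def main(args: Array[String]): Unit = {
    // The master URL and resources come from spark-submit, so they are not set here
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    // Assumed host/port for illustration; pass them via args in a real job
    val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
    val wordCounts = lines.flatMap(_.split(" ")).map(w => (w, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}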
1. Queue stream source: local example
The queue stream source is mainly used for testing.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.Queue
val sparkConf = new SparkConf().setAppName("QueueStream")
val sc = new SparkContext(sparkConf)
val ssc = new StreamingContext(sc, Seconds(1))
val rddQueue = new Queue[RDD[Int]]()
// Input DStream
val inputStream = ssc.queueStream(rddQueue)
// Transformed DStream
val mappedStream = inputStream.map(x => (x % 10, 1))
val reducedStream = mappedStream.reduceByKey(_ + _)
// Output DStream
reducedStream.print()
ssc.start()
// Push an RDD of the Ints 1 to 1000, split into 10 partitions, into the queue
rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
ssc.stop(false)
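In practice it is common to keep pushing RDDs into the queue after ssc.start(), so that more than one batch has data to process. A minimal sketch of that variant (the five rounds and the one-second pause are arbitrary choices for illustration):
// Instead of the single += above, feed one RDD per second for five seconds
for (_ <- 1 to 5) {
  rddQueue.synchronized {
    rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
  }
  Thread.sleep(1000)
}
ssc.stop(false)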
2. HDFS file source: local example
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
val sparkConf = new SparkConf().setAppName("HDFSFileStream")
val sc = new SparkContext(sparkConf)
val ssc = new StreamingContext(sc, Seconds(2))
val filePath = "hdfs://nameservice1/user/hive/warehouse/tmp"
// Input DStream
// filePath: the directory to monitor
// filter (Path => Boolean): selects which files to process
// newFilesOnly: whether to process only files that appear after the streaming app starts;
//   when false, pre-existing files are processed too, and the look-back window for "new"
//   files can be tuned with spark.streaming.fileStream.minRememberDuration=60s
val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](filePath, (path: Path) => path.toString.contains("test"), true).map(_._2.toString)
// Transformed DStream
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
// Output DStream
wordCounts.print()
ssc.start()
ssc.awaitTermination()
ssc.stop(false)
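When the files are plain text and the defaults are enough, the textFileStream shorthand covers the same case; it is equivalent to fileStream with LongWritable/Text/TextInputFormat, no filter, and newFilesOnly = true. A minimal sketch against the same directory:
val textLines = ssc.textFileStream("hdfs://nameservice1/user/hive/warehouse/tmp")
val textCounts = textLines.flatMap(_.split(" ")).map(w => (w, 1)).reduceByKey(_ + _)
textCounts.print()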
3. Socket source: local example
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
// Word-count example: Spark Streaming consumes real-time data sent by a TCP server
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
val ssc = new StreamingContext(sc, Seconds(1))
// Data receiver
// Create a ReceiverInputDStream that receives and processes data sent over a socket
// from the given host and port
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordPairs = words.map(x => (x, 1))
val wordCounts = wordPairs.reduceByKey(_ + _)
wordCounts.print()
ssc.start()
// Wait for the streaming program to terminate
ssc.awaitTermination()
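To feed this example during a test, start a plain TCP server on the target host, e.g. nc -lk 9998 on master, and type lines of words into it; each 1-second batch then prints the counts for the words received in that interval.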
Stopping the program
Use the jps command to find the SparkSubmit process, then kill it.
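Killing the process this way can drop batches that are mid-flight. A gentler option, not part of the original notes, is to enable Spark's graceful-shutdown hook so that a plain kill (SIGTERM) lets in-progress batches finish first. A minimal sketch:
val gracefulConf = new SparkConf()
  .setAppName("NetworkWordCount")
  // With this set, SIGTERM stops the StreamingContext gracefully
  // instead of dropping in-flight batches
  .set("spark.streaming.stopGracefullyOnShutdown", "true")
Alternatively, the application itself can call ssc.stop(stopSparkContext = true, stopGracefully = true) when it decides to shut down.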