Streaming word count over socket data
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTCP {
  def main(args: Array[String]): Unit = {
    // Local mode needs at least two threads: one for the receiver, one for processing
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("sparkstreamingTCP")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // One micro-batch every 5 seconds
    val scc = new StreamingContext(sc, Seconds(5))
    // Receive text lines from the socket server
    val lines: ReceiverInputDStream[String] = scc.socketTextStream("sunjunwei.com", 9999)
    val words: DStream[String] = lines.flatMap(line => line.split(" "))
    val wordAndOne: DStream[(String, Int)] = words.map(word => (word, 1))
    // Word count within the current batch only
    val result: DStream[(String, Int)] = wordAndOne.reduceByKey((x, y) => x + y)
    result.print()
    scc.start()
    scc.awaitTermination()
  }
}
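To drive this example, something has to be listening on sunjunwei.com:9999 and writing text lines; netcat works if it is available on that host, e.g. nc -lk 9999, with words typed separated by spaces. Each 5-second batch is counted independently here, so the printed counts reset every batch.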
Streaming word count over socket data, accumulating results across batches
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTCPTotal {
  // Point Hadoop at a local winutils installation when running on Windows
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamingTCPTotal")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val streamingContext: StreamingContext = new StreamingContext(sc, Seconds(5))
    // updateStateByKey requires a checkpoint directory to persist per-key state
    streamingContext.checkpoint("./")
    val lines: ReceiverInputDStream[String] = streamingContext.socketTextStream("sunjunwei.com", 9999)
    val words: DStream[String] = lines.flatMap(_.split(" "))
    val wordAndOne: DStream[(String, Int)] = words.map((_, 1))
    // Accumulate counts across batches instead of counting each batch separately
    val result: DStream[(String, Int)] = wordAndOne.updateStateByKey(updateFunction)
    result.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }

  // newValues holds this batch's counts for a key; runningCount is the state accumulated so far
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = newValues.sum + runningCount.getOrElse(0)
    Some(newCount)
  }
}
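For instance, if a word has already been seen twice in earlier batches and appears three more times in the current batch, the update works out as follows (a small illustration of updateFunction, not part of the running job):

updateFunction(Seq(1, 1, 1), Some(2))  // => Some(5): 3 new occurrences + running total of 2
updateFunction(Seq(1), None)           // => Some(1): a key seen for the first time has no prior state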
Window operations: word count within a time window
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTCPWindow {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("spark_window").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val context: StreamingContext = new StreamingContext(sc, Seconds(5))
    val lines: ReceiverInputDStream[String] = context.socketTextStream("sunjunwei.com", 9999)
    val words: DStream[String] = lines.flatMap(_.split(" "))
    val wordAndOne: DStream[(String, Int)] = words.map((_, 1))
    // Count words over a 10-second window, recomputed every 10 seconds (a tumbling window)
    val result: DStream[(String, Int)] = wordAndOne.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(10))
    result.print()
    context.start()
    context.awaitTermination()
  }
}
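Both the window length and the slide interval must be multiples of the batch interval (Seconds(5) here). With a 10-second window and a 10-second slide the windows do not overlap; a sliding variant that recomputes every 5 seconds over the most recent 10 seconds would look like this (a sketch, not part of the original example):

val sliding: DStream[(String, Int)] =
  wordAndOne.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))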
Window operations: hot (top) words within a time window
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTCPWindowHotWords {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("hot_words").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val scc: StreamingContext = new StreamingContext(sc, Seconds(5))
    val lines: ReceiverInputDStream[String] = scc.socketTextStream("sunjunwei.com", 9999)
    val words: DStream[String] = lines.flatMap(_.split(" "))
    val wordAndOne: DStream[(String, Int)] = words.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOne.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(10))
    // transform runs on the driver for each batch, so take(3) and the println calls
    // below print the top 3 words of every window to the driver console
    val data = result.transform(rdd => {
      // Sort by count in descending order
      val sortRdd: RDD[(String, Int)] = rdd.sortBy(t => t._2, false)
      val sortResult: Array[(String, Int)] = sortRdd.take(3)
      println("************************print top 3 begin*************************")
      sortResult.foreach(println)
      println("************************ print top 3 end *************************")
      sortRdd
    })
    data.print()
    scc.start()
    scc.awaitTermination()
  }
}
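The transform above returns the full sorted RDD, so data.print() still shows up to ten entries per batch. If only the top 3 should flow downstream, the take(3) result can be turned back into an RDD (a sketch under the same setup, not part of the original code):

val top3: DStream[(String, Int)] = result.transform { rdd =>
  rdd.sparkContext.parallelize(rdd.sortBy(_._2, ascending = false).take(3))
}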
Kafka integration: receiver mode
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingKafkaReceiver {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("kafka_receiver").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc: StreamingContext = new StreamingContext(sc, Seconds(5))
    // Checkpoint directory for the updateStateByKey state
    ssc.checkpoint("./kafka-receiver")
    // Receiver-based stream: ZooKeeper quorum, consumer group, and a map of topic -> receiver thread count
    val inputstream = KafkaUtils.createStream(ssc, "sunjunwei.com:2181", "group_01", Map("spark_kafka1" -> 1))
    // Each record is a (key, message) pair; only the message body is needed here
    val result = inputstream.map(tuple => tuple._2).flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc)
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }

  def updateFunc(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    Some(newValues.sum + runningCount.getOrElse(0))
  }
}
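With the receiver-based approach, records are buffered in executor memory before they are processed, so they can be lost if the driver fails. A common safeguard is to enable the write-ahead log alongside the checkpoint directory already set above (a sketch, same setup otherwise assumed):

val conf: SparkConf = new SparkConf()
  .setAppName("kafka_receiver")
  .setMaster("local[4]")
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")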
Flume integration: poll mode
import java.net.InetSocketAddress
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingFlumePoll {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkStreaming_Flume_Poll").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val scc = new StreamingContext(sc, Seconds(5))
    scc.checkpoint("./")
    // Poll mode: Spark pulls events from a Flume agent that runs the Spark sink on this host/port
    val flumeStream = FlumeUtils.createPollingStream(scc, "sunjunwei.com", 8888, StorageLevel.MEMORY_AND_DISK)
    // The Flume event body is a byte buffer; turn it back into a text line
    val lineStream = flumeStream.map(x => new String(x.event.getBody.array()))
    val result = lineStream.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunction)
    result.print()
    scc.start()
    scc.awaitTermination()
  }
}
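The java.net.InetSocketAddress import points at the overload of createPollingStream that pulls from several Flume agents in a single stream; a sketch under the same setup (the second agent on port 8889 is hypothetical):

val addresses = Seq(
  new InetSocketAddress("sunjunwei.com", 8888),
  new InetSocketAddress("sunjunwei.com", 8889))  // hypothetical second agent
val multiAgentStream = FlumeUtils.createPollingStream(scc, addresses, StorageLevel.MEMORY_AND_DISK)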
Flume integration: push mode
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingFlumePush {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkStreaming_Flume_Push").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val scc = new StreamingContext(sc, Seconds(5))
    scc.checkpoint("./")
    // Push mode: this application starts an Avro receiver on 192.168.0.110:8888 and the Flume
    // agent's avro sink pushes events to it, so the application must be running before Flume starts
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] = FlumeUtils.createStream(scc, "192.168.0.110", 8888, StorageLevel.MEMORY_AND_DISK)
    val lineStream: DStream[String] = flumeStream.map(x => new String(x.event.getBody.array()))
    val result: DStream[(String, Int)] = lineStream.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunction)
    // Show up to 30 results per batch
    result.print(30)
    scc.start()
    scc.awaitTermination()
  }
}