Dependencies
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.4.5</version>
<!--<scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.6</version>
</dependency>
Collecting from files
val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
val ssc = new StreamingContext(conf,Seconds(5))
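//textFileStream only picks up files that are created or moved into the directory after the job starts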
val fileDStream = ssc.textFileStream("data/file")
val wordCount = fileDStream.flatMap(line=>line.split("\\s+"))
.map((_,1))
.reduceByKey(_+_)
wordCount.print()
ssc.start()
ssc.awaitTermination()
Custom receiver
import java.io.{BufferedReader, InputStreamReader}
import org.apache.spark.{SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver
class MyReceiver(host:String,port:Int) extends Receiver[String](StorageLevel.MEMORY_ONLY){
var socket:java.net.Socket=null
def receive():Unit={
socket = new java.net.Socket(host, port) //assign the member field so that onStop() can close it
val reader = new BufferedReader(new InputStreamReader(socket.getInputStream,"UTF-8"))
var line: String = reader.readLine()
//keep reading until the stream is closed or the sender says "end"
while (line != null) {
if (line.equals("end")) {
return
} else {
this.store(line)
}
line = reader.readLine()
}
}
override def onStart():Unit = {
new Thread(new Runnable{
override def run():Unit={
receive()
}
}).start()
}
override def onStop():Unit = {
if(socket != null){
socket.close()
socket=null
}
}
}
object MyReceiverDemo{
def main(args:Array[String]):Unit = {
val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
val ssc = new StreamingContext(conf,Seconds(5))
val receiverStream = ssc.receiverStream(new MyReceiver("192.168.136.10",9999))
val streamCount = receiverStream.flatMap(_.split("\\s+"))
.map((_,1))
.reduceByKey(_+_)
streamCount.print()
ssc.start()
ssc.awaitTermination()
}
}
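For comparison, Spark already ships with a built-in socket source that does essentially what this custom receiver does. A minimal sketch using the same host and port (lines can be fed to the port with a tool such as netcat, e.g. nc -lk 9999 on 192.168.136.10):
//built-in socket source, roughly equivalent to the custom MyReceiver above
val lines = ssc.socketTextStream("192.168.136.10", 9999, StorageLevel.MEMORY_ONLY)
lines.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _).print()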
Collecting data from Kafka
Stateless
val conf = new SparkConf().setMaster("local[2]").setAppName("spark-kafka")
val ssc = new StreamingContext(conf, Seconds(5))
//Kafka connection parameters
val kafkaParams: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG->"spark-kafka")
)
//receive messages from the producer
val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
//location strategy: PreferConsistent distributes partitions evenly across the available executors
LocationStrategies.PreferConsistent,
// LocationStrategies.PreferBrokers
// LocationStrategies.PreferFixed()
//subscribe to a fixed set of topics
ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaParams)
)
//word count
val wordCount = kafkaStream.flatMap(v => v.value().toString.split(" "))
.map((_, 1))
.reduceByKey(_ + _)
wordCount.print()
//start
ssc.start()
ssc.awaitTermination()
Produce messages with the console producer:
kafka-console-producer.sh --topic sparkDemo --broker-list 192.168.136.10:9092
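If the topic does not exist yet, it may need to be created first. The exact flags depend on the Kafka version; for the ZooKeeper-based tooling of this era, and assuming ZooKeeper listens on 192.168.136.10:2181, something like:
kafka-topics.sh --create --zookeeper 192.168.136.10:2181 --replication-factor 1 --partitions 1 --topic sparkDemo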
Stateful
A checkpoint directory must be set.
val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
val ssc = new StreamingContext(conf, Seconds(5))
//set the checkpoint directory
ssc.checkpoint("checkpoint")
//Kafka connection parameters
val kafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup1")
)
//receive data from the producer
val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("kafkaWindow1"), kafka)
)
//word count
val msg = stream.flatMap(v => v.value().toString.split("\\s+"))
.map((_, 1))
//stateful counting: accumulate counts across batches
val sum: DStream[(String, Int)] = msg.updateStateByKey {
case (seq, buffer) => {
val allSum: Int = buffer.getOrElse(0) + seq.sum
Option(allSum)
}
}
sum.print()
//start
ssc.start()
ssc.awaitTermination()
Produce messages:
kafka-console-producer.sh --topic kafkaWindow1 --broker-list 192.168.136.10:9092
Compared with the stateless version, the stateful version keeps history: when counting words, words from earlier batches are also included in the running totals.
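The update function passed to updateStateByKey above can also be written out with explicit types, which makes the per-key state semantics clearer; a minimal sketch:
//newCounts: the counts for this key in the current batch
//state: the running total accumulated from earlier batches (None when the key is first seen)
def updateFunc(newCounts: Seq[Int], state: Option[Int]): Option[Int] =
  Some(state.getOrElse(0) + newCounts.sum)
//equivalent to the pattern-matching version above:
//val sum: DStream[(String, Int)] = msg.updateStateByKey(updateFunc)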
Window functions
window
Non-overlapping windows (the new window does not overlap the previous one)
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
val ssc = new StreamingContext(conf,Seconds(5))
val sparkKafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "window1")
)
val KafkaWindow: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
)
val wordCount = KafkaWindow.flatMap(v => v.value().toString.split("\\s+"))
.map((_, 1))
//the window duration must be a multiple of the batch interval set above (5s)
.window(Seconds(5))
.reduceByKey(_ + _)
wordCount.print()
ssc.start()
ssc.awaitTermination()
Produce messages:
kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092
Overlapping windows (the new window overlaps the previous one)
val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
val ssc = new StreamingContext(conf,Seconds(2))
val sparkKafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "window1")
)
val KafkaWindow: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
)
val wordCount = KafkaWindow.flatMap(v => v.value().toString.split("\\s+"))
.map((_, 1))
//window duration and slide interval must be multiples of the batch interval set above (2s)
.window(Seconds(4),Seconds(2))
.reduceByKey(_ + _)
wordCount.print()
ssc.start()
ssc.awaitTermination()
Produce messages:
kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092
countByWindow
Counts the number of elements in the current window.
A checkpoint directory must be set.
val conf = new SparkConf().setMaster("local[2]").setAppName("demo2")
val ssc = new StreamingContext(conf,Seconds(2))
ssc.checkpoint("checkpoint")
val sparkKafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "Window2")
)
val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
)
val res = kafkamsg.flatMap(v => v.value().toString.split("\\s+"))
.map((_, 1))
.countByWindow(Seconds(4), Seconds(2))
res.print()
ssc.start()
ssc.awaitTermination()
Produce messages:
kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092
As messages arrive, it only reports the total number of elements (words) in the window, not per-word counts.
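Under the hood, countByWindow is essentially a reduceByWindow over a stream of 1s that uses an inverse function to subtract batches sliding out of the window, which is why the checkpoint directory is needed; roughly:
//countByWindow(Seconds(4), Seconds(2)) behaves roughly like:
kafkamsg.flatMap(v => v.value().toString.split("\\s+"))
  .map(_ => 1L)
  .reduceByWindow(_ + _, _ - _, Seconds(4), Seconds(2))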
countByValueAndWindow
Counts, for each distinct value, how many times it appears in the current window.
A checkpoint directory must be set.
val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
val ssc = new StreamingContext(conf,Seconds(2))
ssc.checkpoint("checkpoint")
val sparkKafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "Window3")
)
val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
)
val res: DStream[(String, Long)] = kafkaMsg.flatMap(v => v.value().toString.split("\\s+"))
.countByValueAndWindow(Seconds(4), Seconds(2))
res.print()
ssc.start()
ssc.awaitTermination()
Produce messages:
kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092
reduceByWindow
The elements falling into the window are first taken from the source DStream to form a windowed DStream, and the reduce function is then applied over those windowed elements.
val conf = new SparkConf().setMaster("local[2]").setAppName("demo4")
val ssc = new StreamingContext(conf,Seconds(2))
val kafkaStream: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, " org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, " org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "window4")
)
val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("windowStream"), kafkaStream)
)
val wordcount = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
.reduceByWindow(_ + ":" + _, Seconds(8), Seconds(4))
wordcount.print()
ssc.start()
ssc.awaitTermination()
reduceByKeyAndWindow
Aggregates values by key over the window.
val conf = new SparkConf().setMaster("local[2]").setAppName("demo5")
val ssc = new StreamingContext(conf,Seconds(2))
ssc.checkpoint("checkpoint")
val kafkaStream = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "window5")
)
val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaStream)
)
val res: DStream[(String, Int)] = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
.map((_, 1))
.reduceByKeyAndWindow((x: Int, y: Int) => {
x + y
}, Seconds(4), Seconds(2))
//this variant (with an inverse function) requires the checkpoint set above
// .reduceByKeyAndWindow((x:Int,y:Int)=>{x+y},(x:Int,y:Int)=>{x-y},Seconds(4),Seconds(2))
res.print()
ssc.start()
ssc.awaitTermination()
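The commented-out overload takes a second, inverse function: instead of recomputing the whole window on every slide, Spark adds the batches that enter the window and subtracts the ones that leave it, which is what makes the checkpoint mandatory. A minimal sketch:
//incremental window aggregation: (_ + _) folds new batches in, (_ - _) removes expired ones
//requires ssc.checkpoint(...) to be set, as done above
val resInc: DStream[(String, Int)] = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
  .map((_, 1))
  .reduceByKeyAndWindow((x: Int, y: Int) => x + y, (x: Int, y: Int) => x - y, Seconds(4), Seconds(2))
resInc.print()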
The producer is the same as before, so it is not repeated here.
transform
Supports arbitrary RDD-to-RDD transformations on each batch.
import java.text.SimpleDateFormat
import org.apache.spark.rdd.RDD
val conf = new SparkConf().setMaster("local[2]").setAppName("transform")
val ssc = new StreamingContext(conf,Seconds(2))
val sparkKafka: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "transform1")
)
val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkDemo"), sparkKafka))
val wordCount: DStream[((String, String), Int)] = kafkamsg.transform((rdd, timestamp) => {
val format: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd HH:mm:ss")
val time: String = format.format(timestamp.milliseconds)
val value: RDD[((String, String), Int)] = rdd.flatMap(x => x.value().toString.split("\\s+"))
.map(x => ((x, time), 1))
.reduceByKey(_ + _)
.sortBy(_._2, false)
value
})
wordCount.print()
ssc.start()
ssc.awaitTermination()
transform source code:
The parameter is a user-defined function that returns an RDD; transform itself returns a DStream.
def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = ssc.withScope {
// because the DStream is reachable from the outer object here, and because
// DStreams can't be serialized with closures, we can't proactively check
// it for serializability and so we pass the optional false to SparkContext.clean
val cleanedF = context.sparkContext.clean(transformFunc, false)
val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
assert(rdds.length == 1)
cleanedF(rdds.head.asInstanceOf[RDD[T]], time)
}
new TransformedDStream[U](Seq(this), realTransformFunc)
}
SQLContext
Operate on the stream with SQL statements.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object streamSQLDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[2]").setAppName("streamsql")
val ssc = new StreamingContext(conf,Seconds(2))
val kafkaStream: Map[String, String] = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "streamsql")
)
val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaStream)
)
val wordcount: DStream[Row] = kafkamsg.transform(rdd => {
val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
import sqlContext.implicits._
val value = rdd.flatMap(x => x.value().toString.split("\\s+")).map((_, 1))
value.toDF("name", "cn")
.createOrReplaceTempView("tbword")
val frame = sqlContext.sql("select name,count(cn) from tbword group by name")
frame.rdd
})
wordcount.print()
ssc.start()
ssc.awaitTermination()
}
}
object SQLContextSingleton{
@transient private var instance:SQLContext=_
def getInstance(sc:SparkContext):SQLContext={
synchronized(
if(instance == null) {
instance = new SQLContext(sc)
}
)
instance
}
}
Another example
This version prints the output as a table:
import org.apache.spark.sql.SparkSession
object Demo_SQLWordCount {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("demo").setMaster("local[*]")
val ssc = new StreamingContext(conf,Seconds(5))
//the SparkSession must be created after the StreamingContext
val spark = SparkSession.builder().config(conf).getOrCreate()
import spark.implicits._
val lines = ssc.socketTextStream("hadoop01",7777)
val words = lines.flatMap(_.split("\\s+"))
words.foreachRDD(rdd=>{
if(rdd.count()!=0) {
val df = rdd.map(x=>Word(x)).toDF()
df.createOrReplaceTempView("tb_word")
spark.sql("select word,count(1) from tb_word group by word")
.show()
}
})
ssc.start()
ssc.awaitTermination()
}
case class Word(word:String)
}
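To feed this example, start a TCP server on port 7777 of hadoop01 and type lines into it, for instance with netcat (assuming it is installed):
nc -lk 7777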