Reading and counting the data of each batch (textFileStream, socketTextStream)
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Read data coming from a socket.
 */
object CustomD {
def main(args: Array[String]): Unit = {
// Create the Spark connection
val sc = new SparkContext(new SparkConf().setAppName("testsocket").setMaster("local[6]"))
// Batch interval for processing the data
val ssc: StreamingContext = new StreamingContext(sc,Seconds(5))
sc.setLogLevel("ERROR")
// Receive data
val value: ReceiverInputDStream[String] = ssc.socketTextStream("node01",9999)
// Process the received data (word count)
val result: DStream[(String, Int)] = value.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_)
result.print()
// Start the context and block the main thread while waiting for data
ssc.start()
ssc.awaitTermination()
}
}
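The heading above also mentions textFileStream, which reads new text files dropped into a monitored directory instead of a socket. Below is a minimal sketch along the same lines; the object name TextFileStreamDemo and the directory path are placeholders, not part of the original example.
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
 * Word count over files appearing in a monitored directory (textFileStream).
 */
object TextFileStreamDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("testtextfile").setMaster("local[6]"))
    val ssc = new StreamingContext(sc, Seconds(5))
    sc.setLogLevel("ERROR")
    // textFileStream only picks up files created in the directory after the stream has started
    val lines: DStream[String] = ssc.textFileStream("E:\\Idea\\sparksql1010\\data\\input")
    val result: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}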
To read from a custom data source, Spark Streaming lets you define your own receiver: extend Receiver and implement the onStart and onStop methods to collect the data yourself.
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
 * Custom receiver: extend Receiver and implement onStart/onStop to define your own data source.
 */
class UserDefinedReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2){
// Receive data from the socket
def receive(): Unit = {
// Open a socket connection to the source
val socket = new Socket("node01",9999)
// Get the input stream from the socket
val is: InputStream = socket.getInputStream
val reader = new BufferedReader(new InputStreamReader(is,StandardCharsets.UTF_8))
var line: String = reader.readLine()
// Keep reading lines until the stream ends or the receiver is stopped
while (line != null && !isStopped()) {
  // Store each received line; Spark hands it to the next batch for processing
  store(line)
  line = reader.readLine()
}
socket.close()
}
// Called when the receiver starts
override def onStart(): Unit = {
// Start a thread that keeps pulling data from the socket port
new Thread() {
  override def run(): Unit = {
    // Receive data
    receive()
  }
}.start()
}
// Called when the receiver stops; while it is listening for data it normally does not stop, so nothing is done here
override def onStop(): Unit = {}
}
object UserDefinedReceiverTest {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[6]").setAppName("maintest"))
    val ssc = new StreamingContext(sc, Seconds(5))
    // Get a DStream backed by the custom receiver
    val data: ReceiverInputDStream[String] = ssc.receiverStream(new UserDefinedReceiver)
    data.print()
    // Start and wait for termination
    ssc.start()
    ssc.awaitTermination()
  }
}
To keep a running count across all batches (global state), use updateStateByKey.
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
 * Read data batch by batch and maintain a global running count.
 */
object Demo {
def main(args: Array[String]): Unit = {
val sc: SparkContext = new SparkContext(new SparkConf().setMaster("local[6]").setAppName("testuddatset"))
// Process one batch every second (1000 ms)
val ssc = new StreamingContext(sc,Milliseconds(1000))
sc.setLogLevel("ERROR")
// Checkpoint directory for the intermediate state
ssc.checkpoint("E:\\Idea\\sparksql1010\\data\\countword")
val data: ReceiverInputDStream[String] = ssc.socketTextStream("node01",9999)
// updateStateByKey needs a function that merges the previous state of a key with the values from the current batch
def updateFunc(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
  Some(newValues.sum + runningCount.getOrElse(0))
}
data.flatMap(_.split(" ")).map((_,1)).updateStateByKey(updateFunc _).print()
ssc.start()
ssc.awaitTermination()
}
}
Window operations
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
 * Window operation test
 */
object Demo {
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setAppName("name").setMaster("local[6]"))
// One-second batch interval
val ssc = new StreamingContext(sc,Seconds(1))
sc.setLogLevel("ERROR")
// Checkpoint directory for the intermediate data kept by window operations
ssc.checkpoint("E:\\Idea\\sparksql1010\\data\\countword")
val data: ReceiverInputDStream[String] = ssc.socketTextStream("node01",9999)
val words : DStream[(String, Int)] = data.flatMap(_.split(" ")).map((_,1))
// window(windowLength, slideInterval): both must be multiples of the batch interval
// If the window length is greater than the slide interval, consecutive windows overlap and records are counted more than once
// If the window length is less than the slide interval, some records fall between windows and are lost
val windowscount: DStream[(String, Int)] = words.window(Seconds(3),Seconds(2))
windowscount.reduceByKey(_+_).print()
ssc.start()
ssc.awaitTermination()
}
}
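For comparison, the window-then-reduce above can also be written with reduceByKeyAndWindow, which fuses the two steps into one call. This is a minimal sketch under the same assumptions as the examples above (host node01, port 9999); the object name is a placeholder.
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
 * Windowed word count using reduceByKeyAndWindow.
 */
object ReduceByKeyAndWindowDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("windowreduce").setMaster("local[6]"))
    val ssc = new StreamingContext(sc, Seconds(1))
    sc.setLogLevel("ERROR")
    val pairs: DStream[(String, Int)] = ssc.socketTextStream("node01", 9999)
      .flatMap(_.split(" ")).map((_, 1))
    // Sum the counts of each word over a 3-second window that slides every 2 seconds
    val windowedCounts: DStream[(String, Int)] = pairs.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(3), Seconds(2))
    windowedCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}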