1. A business requirement calls for a data source that continuously produces data, so we define our own by extending Spark's Receiver class:
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
/**
* @Author: wpp
* @Date: 2020/5/4 23:36
*
*/
// Custom data source that continuously produces data
// (host and port are unused in this counter example; see the socket variant below)
class CustomSourceReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  // Called once when the receiver starts: produce data and hand it to Spark via store()
  override def onStart(): Unit = {
    var idx = 0
    new Thread("Socket Receiver") {
      override def run(): Unit = {
        // Emit an incrementing counter once per second until the receiver is stopped
        while (!isStopped()) {
          idx += 1
          store(idx.toString)
          Thread.sleep(1000)
        }
      }
    }.start()
  }

  // Called when the receiver stops; the loop above checks isStopped(), so nothing to clean up
  override def onStop(): Unit = {}
}
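The host and port parameters go unused in the counter sketch above; they hint at the more common socket-reading pattern. A minimal sketch of that variant, modeled on the custom-receiver example in the Spark Streaming programming guide (the class name SocketLineReceiver is our own, and we assume a line-oriented text server listening at host:port):

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Hypothetical socket-based variant: reads lines from host:port and forwards them to Spark
class SocketLineReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    // Receive on a separate thread so onStart() returns immediately
    new Thread("Socket Receiver") {
      override def run(): Unit = receive()
    }.start()
  }

  override def onStop(): Unit = {} // the receive loop checks isStopped() and exits on its own

  private def receive(): Unit = {
    try {
      val socket = new Socket(host, port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
      var line = reader.readLine()
      while (!isStopped() && line != null) {
        store(line) // hand each line to Spark
        line = reader.readLine()
      }
      reader.close()
      socket.close()
      // Ask Spark to restart the receiver if the server closed the connection
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException => restart("Could not connect to " + host + ":" + port, e)
      case t: Throwable                 => restart("Error receiving data", t)
    }
  }
}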
2. The test code is as follows:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* @Author: wpp
* @Date: 2020/5/5 0:18
*
*/
object CustomStreamSourceTest {
  def main(args: Array[String]): Unit = {
    // Initialize the Spark configuration
    val sparkConf: SparkConf = new SparkConf().setAppName("App").setMaster("local[*]")
    // Initialize the StreamingContext with a 5-second batch interval
    val ssc: StreamingContext = new StreamingContext(sparkConf, Seconds(5))
    // Create an input DStream backed by the custom receiver
    val lineStream: ReceiverInputDStream[String] = ssc.receiverStream(new CustomSourceReceiver("hadoop100", 8888))
    // Split each line into individual words
    val wordStream: DStream[String] = lineStream.flatMap(_.split("\t"))
    // Map each word to a (word, 1) tuple
    val wordAndOneStream: DStream[(String, Int)] = wordStream.map((_, 1))
    // Sum the counts of identical words
    val wordCount: DStream[(String, Int)] = wordAndOneStream.reduceByKey(_ + _)
    // Print the results
    wordCount.print()
    // Start the StreamingContext and wait for termination
    ssc.start()
    ssc.awaitTermination()
  }
}
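Note that the counter receiver emits one plain number per second with no tab character, so the flatMap and reduceByKey are effectively pass-throughs here: each 5-second batch should print roughly five pairs such as (1,1), (2,1), and so on. A socket-backed receiver fed real text would produce genuine word counts.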
3. Finally, the relevant spark-streaming jar just needs to be added to the project, for example via the build dependency shown below.
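With sbt, for instance, one line in build.sbt is enough (the version 2.4.5 is only an illustration; match it to the Spark version on your cluster):

libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.4.5"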