Read data by listening on port 7777:
Add the Maven dependencies:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
Start nc on Linux:
nc -lk 7777
Write the Scala program:
package nj.zb.kb09.spark

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamDemo1 {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("demo1")
    // Batch interval: the specified 3 seconds is the time between batches
    val streamingContext = new StreamingContext(sparkConf, Seconds(3))
    // Specify the input source
    val socketLineStream: ReceiverInputDStream[String] = streamingContext.socketTextStream("192.168.237.100", 7777)
    // Process the received data: word count
    val wordStream: DStream[String] = socketLineStream.flatMap(line => line.split("\\s+"))
    val mapStream: DStream[(String, Int)] = wordStream.map(x => (x, 1))
    val wordcountstream: DStream[(String, Int)] = mapStream.reduceByKey(_ + _)
    // Print the result
    wordcountstream.print()
    // Start the receiver
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
An error appears because one dependency version is too high:
Caused by: com.fasterxml.jackson.databind.JsonMappingException: Incompatible Jackson version: 2.9.6
You need to add this dependency:
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.6.6</version>
</dependency>
If this error appears:
NoSuchMethodError: org.apache.hadoop.conf.Configuration.getPassword(Ljava/lang/String;
You need to add this dependency:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>
Reason: the default Hadoop version is 2.2.0, which does not have this method.
You also need to manually add the dependency hadoop-common-2.6.0.jar.
At this point, whatever you type into nc on Linux is printed in the console as a word count:
Batches follow the specified interval, here one batch every 3 seconds; Spark Streaming is essentially micro-batch processing.
-------------------------------------------
Time: 1608366480000 ms
-------------------------------------------
(world,1)
(hello,1)
-------------------------------------------
Time: 1608366483000 ms
-------------------------------------------
(hello,1)
(java,1)
-------------------------------------------
The Spark Streaming batch interval:
The specified 3 seconds is the interval between batches.
val streamingContext = new StreamingContext(sparkConf,Seconds(3))
What happens under the hood:
def this(conf: SparkConf, batchDuration: Duration) = {
  this(StreamingContext.createNewSparkContext(conf), null, batchDuration)
}
sparkConf is passed on to new SparkContext(conf).
Seconds(3) is passed to newGraph.setBatchDuration(_batchDur), which sets the batchDuration variable of the StreamingContext's graph: DStreamGraph field, i.e. the batch processing interval.
TODO: found a good article on this, to organize later: https://www.cnblogs.com/upupfeng/p/12325201.html
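For reference, the relevant part of the StreamingContext source (Spark 2.4.x, lightly abridged here; treat it as an approximate excerpt rather than verbatim code) shows where the batch duration ends up:
private[streaming] val graph: DStreamGraph = {
  if (isCheckpointPresent) {
    _cp.graph.setContext(this)
    _cp.graph.restoreCheckpointData()
    _cp.graph
  } else {
    require(_batchDur != null, "Batch duration for StreamingContext cannot be null")
    val newGraph = new DStreamGraph()
    newGraph.setBatchDuration(_batchDur) // Seconds(3) is stored here as the batch interval
    newGraph
  }
}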
Specifying the input source:
Input DStreams represent the raw data stream received from the data source.
streamingContext.socketTextStream("192.168.237.100", 7777)
The socketTextStream method:
def socketTextStream(
    hostname: String,
    port: Int,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
  socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}
/* Creates an input stream from TCP source hostname:port. Data is received using a TCP socket and the receive bytes is interpreted as UTF8 encoded \n delimited lines. */
That is, it connects to hostname:port over a TCP socket and receives bytes as UTF-8 encoded, \n-delimited lines.
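As an aside, the same line-oriented protocol can be produced without nc. A minimal sketch of a server that listens on 7777 and writes UTF-8 lines to whoever connects (a hypothetical helper, not part of the original notes):
import java.io.{OutputStreamWriter, PrintWriter}
import java.net.ServerSocket
import java.nio.charset.StandardCharsets

// Hypothetical helper that plays the role of "nc -lk 7777": it listens on the port
// and writes \n-delimited UTF-8 lines to the Spark receiver once it connects.
object TextLineServer {
  def main(args: Array[String]): Unit = {
    val server = new ServerSocket(7777)
    val socket = server.accept() // blocks until socketTextStream connects
    val out = new PrintWriter(new OutputStreamWriter(socket.getOutputStream, StandardCharsets.UTF_8), true)
    for (i <- 1 to 100) {
      out.println(s"hello world $i")
      Thread.sleep(1000)
    }
    out.close()
    socket.close()
    server.close()
  }
}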
Spark Streaming has two categories of data sources:
(1) Basic sources: available directly through the StreamingContext API, e.g. file systems, socket connections, Akka actors.
(2) Advanced sources: Kafka, Flume, Kinesis, Twitter, and so on.
1. Basic input source APIs
Spark Streaming offers the following kinds of external input sources:
(1) User-defined sources: receiverStream
(2) TCP-based sources: socketTextStream, socketStream
(3) Raw network source: rawSocketStream
(4) Hadoop file system sources: fileStream, textFileStream, binaryRecordsStream
(5) Other sources (a queue of RDDs): queueStream (see the sketch below)
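A minimal sketch of queueStream, handy for testing a job without any external service (the object name and the test data are assumptions, not from the original notes):
import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamQueueDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("queueDemo")
    val streamingContext = new StreamingContext(sparkConf, Seconds(3))

    // The stream is fed from a queue of RDDs; by default one RDD is consumed per batch.
    val rddQueue = new mutable.Queue[RDD[String]]()
    val queueDStream = streamingContext.queueStream(rddQueue)

    queueDStream.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _).print()

    streamingContext.start()
    // Push one new RDD per batch so every batch has data to count.
    for (_ <- 1 to 5) {
      rddQueue.synchronized {
        rddQueue += streamingContext.sparkContext.parallelize(Seq("hello spark", "hello streaming"))
      }
      Thread.sleep(3000)
    }
    streamingContext.stop()
  }
}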
Spark Streaming reading data from files:
package nj.zb.kb09.spark

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamFileDataSourceDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("fileDataSource")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    val fileDStream: DStream[String] = streamingContext.textFileStream("in/test")
    val wordStream: DStream[String] = fileDStream.flatMap(line => line.split("\\s+"))
    val mapStream: DStream[(String, Int)] = wordStream.map((_, 1))
    val sumStream: DStream[(String, Int)] = mapStream.reduceByKey(_ + _)
    sumStream.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    // Files are picked up by modification time (within about 1 minute);
    // to trigger processing, change the content and then rename/move the file into the directory.
  }
}
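textFileStream only notices files that newly appear in the watched directory, so a common pattern is to write the file somewhere else and then move or rename it into the directory in one atomic step. A minimal sketch (the paths are assumptions):
import java.nio.file.{Files, Paths, StandardCopyOption}

// Hypothetical helper: stage the file outside the watched directory, then move it in
// so the streaming job sees a complete file appear in a single step.
object MoveIntoWatchedDir {
  def main(args: Array[String]): Unit = {
    val staged = Paths.get("in/tmp/words.txt")  // assumed staging path
    val target = Paths.get("in/test/words.txt") // directory watched by textFileStream
    Files.createDirectories(target.getParent)
    Files.move(staged, target, StandardCopyOption.ATOMIC_MOVE)
  }
}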
Spark Streaming reading data from Kafka:
Add the dependency:
<!-- spark-streaming-kafka-0-10 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
package nj.zb.kb09.spark

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamKafkaSource {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("kafkaDemo").setMaster("local[2]")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    // Kafka consumer configuration
    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.237.100:9092",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup1"
    )
    // Raw input stream from the Kafka source
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      streamingContext, // unlike the basic sources, this is not called on streamingContext; the context is passed in to build the InputDStream
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams) // sparkKafkaDemo is the topic the producer writes to
    )
    val wordStream: DStream[String] = kafkaStream.flatMap(v => v.value().toString.split("\\s+"))
    val mapStream: DStream[(String, Int)] = wordStream.map((_, 1))
    val sumStream: DStream[(String, Int)] = mapStream.reduceByKey(_ + _)
    sumStream.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    // kafka-topics.sh --create --zookeeper hadoop100:2181 --topic sparkKafkaDemo --partitions 1 --replication-factor 1
    // Created topic "sparkKafkaDemo".
    // kafka-console-producer.sh --broker-list hadoop100:9092 --topic sparkKafkaDemo
  }
}
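With the direct stream, offsets are tracked by Spark rather than auto-committed by the consumer group. If you want to commit them back to Kafka after processing, the usual pattern looks roughly like the sketch below (added here for illustration only, not part of the original notes):
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

// Inside the program above, instead of (or in addition to) sumStream.print():
kafkaStream.foreachRDD { rdd =>
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Commit the offsets for this batch back to Kafka asynchronously.
  kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}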
Kafka location strategies: LocationStrategies.PreferConsistent
In most cases Spark Streaming reads with LocationStrategies.PreferConsistent, which distributes the partitions evenly across the executors in the cluster.
If the executors run on the same nodes as the Kafka brokers, LocationStrategies.PreferBrokers can be used, so each executor consumes the partitions whose leader is on its local broker.
If the load across partitions is noticeably skewed, LocationStrategies.PreferFixed lets you specify, via a map, which node each topic partition should be placed on (a sketch follows below).
Further reading: https://www.cnblogs.com/wcgstudy/p/11117888.html
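A minimal sketch of PreferFixed (the host name and partition are assumptions); it replaces the LocationStrategies.PreferConsistent argument in the createDirectStream call above:
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.LocationStrategies

// Hypothetical mapping: pin partition 0 of sparkKafkaDemo to the executor on hadoop100.
val hostMap = Map(new TopicPartition("sparkKafkaDemo", 0) -> "hadoop100")
val locationStrategy = LocationStrategies.PreferFixed(hostMap)
// Pass locationStrategy to KafkaUtils.createDirectStream in place of PreferConsistent.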
A custom Receiver:
package nj.zb.kb09.spark

import java.io.{BufferedReader, InputStreamReader}

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

class MyReceiver(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_ONLY) {
  var socket: java.net.Socket = null

  def receive(): Unit = {
    socket = new java.net.Socket(host, port)
    val reader = new BufferedReader(
      new InputStreamReader(socket.getInputStream, "UTF-8")
    )
    // Note: in Scala an assignment evaluates to Unit, so the Java idiom
    // `while ((line = reader.readLine()) != null)` would always be true.
    // Read the first line before the loop and re-read at the end of each iteration.
    var line: String = reader.readLine()
    while (line != null) {
      if (line.equals("end")) {
        return
      } else {
        this.store(line)
      }
      line = reader.readLine()
    }
  }

  override def onStart(): Unit = {
    // Receive on a separate thread so onStart() returns immediately.
    new Thread(new Runnable {
      override def run(): Unit = {
        receive()
      }
    }).start()
  }

  override def onStop(): Unit = {
    if (socket != null) {
      socket.close()
      socket = null
    }
  }
}
object MyReceiverDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("myReceiverDemo")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    val receiverStream: ReceiverInputDStream[String] = streamingContext.receiverStream(new MyReceiver("192.168.237.100", 7777))
    val lineStream: DStream[String] = receiverStream.flatMap(line => line.split("\\s+"))
    val mapStream: DStream[(String, Int)] = lineStream.map((_, 1))
    val sumStream: DStream[(String, Int)] = mapStream.reduceByKey(_ + _)
    sumStream.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
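In a more robust receiver you would not simply return on errors. A sketch of the usual pattern (assuming the MyReceiver class above) is to catch exceptions and call restart(...), so Spark re-launches the receiver after a failure:
// Sketch only: a more fault-tolerant receive() for the MyReceiver class above.
def receive(): Unit = {
  try {
    socket = new java.net.Socket(host, port)
    val reader = new java.io.BufferedReader(
      new java.io.InputStreamReader(socket.getInputStream, "UTF-8")
    )
    var line: String = reader.readLine()
    while (!isStopped() && line != null) {
      this.store(line)
      line = reader.readLine()
    }
    // Server closed the connection; ask Spark to restart the receiver.
    restart("Connection to " + host + ":" + port + " closed, restarting")
  } catch {
    case e: java.net.ConnectException =>
      restart("Error connecting to " + host + ":" + port, e)
    case t: Throwable =>
      restart("Error receiving data", t)
  }
}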