Reference: the official Spark Streaming + Flume Integration Guide.
First, add the dependency:
groupId = org.apache.spark
artifactId = spark-streaming-flume_2.12
version = 2.4.5
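In a Maven pom.xml those coordinates become the following dependency (a minimal sketch; pick the _2.11 or _2.12 suffix that matches your build's Scala version):

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-flume_2.12</artifactId>
    <version>2.4.5</version>
</dependency>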
1. Integrating Spark Streaming with Flume (push approach)
First, write the Flume configuration file.
For testing, the source is netcat; in the push approach, the sink must be avro.
# simple-agent: agent name; netcat-source: source name; avro-sink: sink name; memory-channel: channel name
simple-agent.sources = netcat-source
simple-agent.sinks = avro-sink
simple-agent.channels = memory-channel
# Source settings: a netcat source bound to hadoop000 (192.168.0.133), port 44444
simple-agent.sources.netcat-source.type = netcat
simple-agent.sources.netcat-source.bind = hadoop000
simple-agent.sources.netcat-source.port = 44444
# Sink settings: an avro sink that pushes to the host and port where the Spark Streaming receiver will listen (41414)
simple-agent.sinks.avro-sink.type = avro
simple-agent.sinks.avro-sink.hostname = 192.168.0.133
simple-agent.sinks.avro-sink.port = 41414
# Channel settings: a memory channel
simple-agent.channels.memory-channel.type = memory
# Wire the source and the sink to the channel
simple-agent.sources.netcat-source.channels = memory-channel
simple-agent.sinks.avro-sink.channel = memory-channel
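Note that this config mixes the hostname hadoop000 with the IP 192.168.0.133; it assumes both name the same machine, for example via an /etc/hosts entry like:

192.168.0.133   hadoop000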
Next, we write the Spark Streaming Maven project.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePush {
  def main(args: Array[String]): Unit = {
    // Build a SparkConf with the app name and master
    val conf: SparkConf = new SparkConf()
      .setAppName("FlumePush")
      .setMaster("local[2]")
    // Create a StreamingContext; the second argument is the batch interval
    val ssc = new StreamingContext(conf, Seconds(4))
    // Take the listen address and port from the command-line arguments,
    // e.g. "hadoop000 41414"; they must match the avro sink's hostname/port
    val Array(hostname, port) = args
    // FlumeUtils.createStream takes the StreamingContext plus the address
    // and port on which the receiver listens for pushed events
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createStream(ssc, hostname, port.toInt)
    // Extract each event body as a string
    flumeStream.map(x => new String(x.event.getBody.array()).trim)
      // Word count
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      // Print each batch
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
For the push approach, start the Spark Streaming application first, then start Flume: the avro sink needs a running receiver to connect to.
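Run the application with program arguments such as hadoop000 41414 (so the receiver listens where the avro sink points), then launch the agent. Assuming the config above was saved as flume-push.conf (the file name is an assumption):

flume-ng agent \
  --name simple-agent \
  --conf $FLUME_HOME/conf \
  --conf-file $FLUME_HOME/conf/flume-push.conf \
  -Dflume.root.logger=INFO,console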
Test:
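The netcat source accepts lines over TCP, so telnet (or nc) works as a test client; for example:

telnet hadoop000 44444
hello spark hello flume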
Check the console:
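DStream.print() renders each batch as a block headed by the batch time; for the input above, the output would look roughly like this (timestamp illustrative):

-------------------------------------------
Time: 1590000000000 ms
-------------------------------------------
(hello,2)
(spark,1)
(flume,1)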
2. Integrating Spark Streaming with Flume (pull approach)
Reference: the official Spark Streaming + Flume Integration Guide (pull-based approach).
First, write the Flume configuration file.
For testing, the source is netcat; in the pull approach, the sink must be the custom SparkSink.
# simple-agent: agent name; netcat-source: source name; spark-sink: sink name; memory-channel: channel name
simple-agent.sources = netcat-source
simple-agent.sinks = spark-sink
simple-agent.channels = memory-channel
# Source settings: a netcat source bound to 192.168.0.133, port 44444
simple-agent.sources.netcat-source.type = netcat
simple-agent.sources.netcat-source.bind = 192.168.0.133
simple-agent.sources.netcat-source.port = 44444
# Sink settings: a SparkSink that buffers events until Spark pulls them; hostname and port are where the sink listens (41414)
simple-agent.sinks.spark-sink.type = org.apache.spark.streaming.flume.sink.SparkSink
simple-agent.sinks.spark-sink.hostname = hadoop000
simple-agent.sinks.spark-sink.port = 41414
# Channel settings: a memory channel
simple-agent.channels.memory-channel.type = memory
# Wire the source and the sink to the channel
simple-agent.sources.netcat-source.channels = memory-channel
simple-agent.sinks.spark-sink.channel = memory-channel
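Note that org.apache.spark.streaming.flume.sink.SparkSink does not ship with Flume. Per the official integration guide, the sink jar and its dependencies must be placed on Flume's classpath (e.g. $FLUME_HOME/lib) before starting the agent; the exact version numbers below are assumptions matching this build:

cp spark-streaming-flume-sink_2.12-2.4.5.jar $FLUME_HOME/lib/
cp scala-library-2.12.10.jar $FLUME_HOME/lib/
cp commons-lang3-3.5.jar $FLUME_HOME/lib/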
Next, we write the Spark Streaming Maven project.
Compared with the push approach above, the only change is the method used to create the stream: createPollingStream instead of createStream (the imports are the same as in the push example).
object FlumePull {
  def main(args: Array[String]): Unit = {
    // Build a SparkConf with the app name and master
    val conf: SparkConf = new SparkConf()
      .setAppName("FlumePull")
      .setMaster("local[2]")
    // Create a StreamingContext; the second argument is the batch interval
    val ssc = new StreamingContext(conf, Seconds(4))
    // Take the SparkSink's host and port from the command-line arguments,
    // e.g. "hadoop000 41414"
    val Array(hostname, port) = args
    // FlumeUtils.createPollingStream takes the StreamingContext plus the
    // address and port of the Flume SparkSink to pull events from
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, hostname, port.toInt)
    // Extract each event body as a string
    flumeStream.map(x => new String(x.event.getBody.array()).trim)
      // Word count
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      // Print each batch
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
With the pull approach, start Flume first, then start the Spark Streaming application: the receiver polls the SparkSink, so the sink must already be running.
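A sketch of that order, with an assumed config file name flume-pull.conf and an assumed application jar path:

flume-ng agent \
  --name simple-agent \
  --conf $FLUME_HOME/conf \
  --conf-file $FLUME_HOME/conf/flume-pull.conf \
  -Dflume.root.logger=INFO,console

spark-submit \
  --class FlumePull \
  --master local[2] \
  --packages org.apache.spark:spark-streaming-flume_2.12:2.4.5 \
  target/your-app.jar hadoop000 41414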
Test: send lines to the netcat source as before (e.g. telnet 192.168.0.133 44444).
Check the console: the word-count output has the same format as in the push example.