There are two ways to integrate Flume with Spark Streaming. The first is push mode: Flume actively pushes data to Spark Streaming through an avro sink, with the Spark receiver acting as the avro server. A drawback is that one Spark Streaming receiver can only be paired with one Flume agent; and because the receiver is the server side, the streaming job must be running before the Flume agent starts. Add the connector dependency to the project:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-flume_2.10</artifactId>
    <version>1.6.2</version>
</dependency>
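If the project is built with sbt instead of Maven, an equivalent dependency line would be the following (a sketch; match the version to your Spark build):

libraryDependencies += "org.apache.spark" % "spark-streaming-flume_2.10" % "1.6.2"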
import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePushWordCount {
  def main(args: Array[String]): Unit = {
    // local[2]: one core for the receiver, one for processing
    val conf = new SparkConf().setAppName("FlumeWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    // Push mode: Flume's avro sink sends events to this receiver,
    // which listens as an avro server on the given host and port
    val flumeStream = FlumeUtils.createStream(ssc, "10.0.15.7", 8888)

    // Each event body is a ByteBuffer; decode it to a String, then split into words
    val words = flumeStream
      .flatMap(x => new String(x.event.getBody.array()).split(" "))
      .map((_, 1))
    val results = words.reduceByKey(_ + _)
    results.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
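To run this outside the IDE, package the class and submit it; a minimal sketch, assuming the application jar is named flume-wordcount.jar (a hypothetical name). Start the streaming job before the Flume agent, since the receiver is the avro server:

spark-submit --class FlumePushWordCount \
  --master local[2] \
  --packages org.apache.spark:spark-streaming-flume_2.10:1.6.2 \
  flume-wordcount.jar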
Flume agent configuration (push mode):
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/hadoop/flumedata/spool1
a1.sources.r1.fileHeader=true
a1.sources.r1.fileHeaderKey=file
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.channels.c1.keep-alive=3
a1.channels.c1.byteCapacityBufferPercentage = 20
a1.channels.c1.byteCapacity = 800000
#a1.sinks.s1.type = logger
a1.sinks.s1.type = avro
a1.sinks.s1.hostname = 10.0.15.7
a1.sinks.s1.port = 8888
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
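Assuming this file is saved as conf/flume-push.conf (a hypothetical path), the agent can then be started with the standard flume-ng command:

bin/flume-ng agent --conf conf --conf-file conf/flume-push.conf --name a1 -Dflume.root.logger=INFO,console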
The other way is pull mode: Spark Streaming pulls data from Flume, and a single streaming job can pull from Flume agents on multiple machines. Events wait in a custom SparkSink on the Flume side until the streaming job polls for them, so here the Flume agent should be started first. Three jars need to be placed under flume/lib (versions should match your Scala and Spark builds):
commons-lang3-3.3.2.jar
spark-streaming-flume-sink_2.10-1.6.1.jar
scala-library-2.10.5.jar
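All three are published to Maven Central; for example, the sink jar corresponds to the following coordinates (shown as an sbt line purely for reference; a sketch, align the version with your Spark release):

libraryDependencies += "org.apache.spark" % "spark-streaming-flume-sink_2.10" % "1.6.1"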
package scala.SparkStreaming

import java.net.InetSocketAddress

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePullWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FlumeWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    // Pull mode: Spark Streaming polls the Flume SparkSink for data.
    // Multiple addresses can be listed to pull from several agents.
    val address = Seq(new InetSocketAddress("192.168.206.201", 8888))
    val flumeStream = FlumeUtils.createPollingStream(ssc, address, StorageLevel.MEMORY_AND_DISK)

    // Decode each event body from bytes and split into words
    val words = flumeStream
      .flatMap(x => new String(x.event.getBody.array()).split(" "))
      .map((_, 1))
    val results = words.reduceByKey(_ + _)
    results.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
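As the comment notes, pulling from several agents only takes more entries in the address list; a sketch with a second, hypothetical host:

// The second host is hypothetical; each machine runs a Flume agent
// with a SparkSink listening on port 8888
val addresses = Seq(
  new InetSocketAddress("192.168.206.201", 8888),
  new InetSocketAddress("192.168.206.202", 8888)
)
val multiStream = FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK)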
Flume agent configuration (pull mode):
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/hadoop/flumedata/spool1
a1.sources.r1.fileHeader=true
a1.sources.r1.fileHeaderKey=file
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.channels.c1.keep-alive=3
a1.channels.c1.byteCapacityBufferPercentage = 20
a1.channels.c1.byteCapacity = 800000
#a1.sinks.s1.type = logger
a1.sinks.s1.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.s1.hostname = 192.168.206.201
a1.sinks.s1.port = 8888
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
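Assuming this file is saved as conf/flume-pull.conf (again a hypothetical path), start the agent first, then launch the streaming job:

bin/flume-ng agent --conf conf --conf-file conf/flume-pull.conf --name a1 -Dflume.root.logger=INFO,console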