push (Flume pushes events to a Spark Streaming receiver via an avro sink)
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object flume_push_streaming {
  // Quiet Spark's verbose INFO logging
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("flume_push_streaming").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Push mode: this program starts an Avro receiver on port 41414 that Flume's
    // avro sink pushes to. Replace "local-ip" with this machine's actual IP.
    val flumeStreaming: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createStream(ssc, "local-ip", 41414)

    // Word count over each Flume event body, printed every 5-second batch
    flumeStreaming
      .map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
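Both programs need the spark-streaming-flume integration artifact on the classpath. A minimal build.sbt sketch, assuming Spark 2.2.0 on Scala 2.11 (the versions are assumptions; match your own cluster):

// build.sbt (assumed versions, adjust to your environment)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0",
  "org.apache.spark" %% "spark-streaming-flume" % "2.2.0"
)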
To run:
- Start the Spark program first.
- Write flume_push_streaming.conf and start Flume (example sketch after this list).
- Start telnet: telnet hadoop01 44444 (use your VM's hostname).
- Enter some test data.
- Check the program's console window for word-count output.
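A sketch of flume_push_streaming.conf, assuming a netcat source on hadoop01:44444 feeding an avro sink that pushes to the Spark receiver; the agent and component names (a1, r1, c1, k1) are assumptions, not from the original notes:

# flume_push_streaming.conf (hypothetical sketch)
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# Netcat source on the port the telnet test connects to
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop01
a1.sources.r1.port = 44444

a1.channels.c1.type = memory

# Avro sink pushes events to the receiver started by FlumeUtils.createStream
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = local-ip
a1.sinks.k1.port = 41414

a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Set hostname to the IP of the machine running the Spark program, then start the agent with something like: flume-ng agent -n a1 -f flume_push_streaming.conf -Dflume.root.logger=INFO,console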
pull (Spark Streaming polls events from a custom Flume SparkSink)
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object flume_pull_streaming {
  // Quiet Spark's verbose INFO logging
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("flume_pull_streaming").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Pull mode: Flume's SparkSink buffers events on hadoop01:41414 and this
    // program polls them, which gives transactional, more reliable delivery.
    val flumeStreaming: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, "hadoop01", 41414)

    // Same word count as the push version
    flumeStreaming
      .map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
To run:
- Write flume_pull_streaming.conf and start Flume first (example sketch after this list).
- Start the Spark program.
- Start telnet: telnet hadoop01 44444 (use your VM's hostname).
- Enter some test data.
- Check the program's console window for word-count output.
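A sketch of flume_pull_streaming.conf: same netcat source, but the sink is Spark's custom SparkSink, which buffers events until the Spark program polls them. Per the linked docs, Flume's classpath also needs the spark-streaming-flume-sink_2.11 jar and its scala-library and commons-lang3 dependencies. Agent and component names are assumptions:

# flume_pull_streaming.conf (hypothetical sketch)
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop01
a1.sources.r1.port = 44444

a1.channels.c1.type = memory

# Custom SparkSink holds events until FlumeUtils.createPollingStream fetches them
a1.sinks.k1.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname = hadoop01
a1.sinks.k1.port = 41414
a1.sinks.k1.channel = c1

a1.sources.r1.channels = c1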
Comparison:
- In code, the Flume stream is created differently: push uses FlumeUtils.createStream, pull uses FlumeUtils.createPollingStream.
- The sink in the Flume config file differs: push uses the built-in avro sink, pull uses the custom org.apache.spark.streaming.flume.sink.SparkSink.
- The startup order is reversed: for push, start the Spark program before Flume (the avro sink needs a running receiver to push to); for pull, start Flume before the Spark program (the program needs a running SparkSink to poll).
Official docs: https://spark.apache.org/docs/2.2.0/streaming-flume-integration.html