Flume as an Upstream Data Source for Spark

Pulling data from Flume into Spark (poll-based approach)

  • Versions: Flume 1.8.0, Spark 2.2.0
  • Create flume-poll.conf:
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe and configure the source component r1
# Note: do not drop files with duplicate names into the monitored directory
a1.sources.r1.channels = c1
a1.sources.r1.type = spooldir
# The directory to monitor for new files
a1.sources.r1.spoolDir = /home/spark/logs
a1.sources.r1.fileHeader = true

# Describe and configure the channel component c1; a memory channel is used here
a1.channels.c1.type = memory
a1.channels.c1.capacity = 20000
a1.channels.c1.transactionCapacity = 5000

# Describe and configure the sink component k1
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.spark.streaming.flume.sink.SparkSink
# IP of the host running Flume (the Spark job will pull from here)
a1.sinks.k1.hostname = 192.168.189.144
a1.sinks.k1.port = 8888
# Number of events handed to Spark per batch
a1.sinks.k1.batchSize = 2000

Copy scala-library-2.11.8.jar from spark/jars into flume/lib.
Download spark-streaming-flume-sink_2.11-2.0.2.jar into flume/lib (ideally the sink jar's version matches the Spark version in use).
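
On the application side, the Spark project also needs the spark-streaming-flume integration library on its classpath. A minimal build.sbt sketch (assuming an sbt build; the versions mirror the ones used above):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0" % "provided",
  "org.apache.spark" %% "spark-streaming-flume" % "2.2.0"
)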
Start Flume:

bin/flume-ng agent -n a1 -c conf -f conf/flume-poll.conf -Dflume.root.logger=INFO,console
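
To verify the pipeline, drop a fresh file into the monitored directory (the filename test.log is just an example; recall the note above about not reusing filenames):

echo "hello spark hello flume" > /home/spark/logs/test.log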

Spark code. In poll mode the SparkSink acts as the server, so Flume should already be running before this job starts pulling:

import java.net.InetSocketAddress

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePollWordCount {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FlumePollWordCount").setMaster("local[2]")
    // Seconds(5): compute one batch every 5 seconds
    val ssc = new StreamingContext(conf, Seconds(5))
    // Address(es) of the Flume SparkSink(s) to pull from
    val address = Seq(new InetSocketAddress("192.168.189.144", 8888))
    val flumeStream = FlumeUtils.createPollingStream(ssc, address, StorageLevel.MEMORY_AND_DISK)
    val words = flumeStream.flatMap(x => new String(x.event.getBody().array()).split(" ")).map((_, 1))
    val results = words.reduceByKey(_ + _)
    results.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
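
Note that new String(x.event.getBody().array()) reads the buffer's entire backing array, which can include stale bytes beyond the event's limit when the ByteBuffer is a slice of a larger buffer. A safer decode, sketched here as a hypothetical bodyToString helper you could call inside flatMap instead:

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

// Decode only the valid region [position, limit) of the event body
def bodyToString(body: ByteBuffer): String = {
  val buf = body.duplicate() // leave the original buffer's position untouched
  val bytes = new Array[Byte](buf.remaining())
  buf.get(bytes)
  new String(bytes, StandardCharsets.UTF_8)
}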

Pushing data from Flume to Spark (push-based approach)

Create flume-push.conf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /export/data/flume
a1.sources.r1.fileHeader = true

# Describe the sink
a1.sinks.k1.type = avro
# The receiver side: host and port of the machine running the Spark application
a1.sinks.k1.hostname = 192.168.189.1
a1.sinks.k1.port = 8000

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Start Flume. In push mode the Spark application is the Avro server, so the Spark job below must already be running when Flume starts; otherwise the avro sink cannot connect:

bin/flume-ng agent -n a1 -c conf -f conf/flume-push.conf -Dflume.root.logger=INFO,console

Spark code:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePushWordCount {

  def main(args: Array[String]): Unit = {
    LoggerLevels.setStreamingLogLevels()
    val conf = new SparkConf().setMaster("local[2]").setAppName("FlumePushWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Push mode: Flume sends data to Spark, which listens on this host and port
    val flumeStream = FlumeUtils.createStream(ssc, "192.168.189.1", 8000)
    // The actual payload of a Flume event is obtained via event.getBody()
    val words = flumeStream.flatMap(x => new String(x.event.getBody().array()).split(" ")).map((_, 1))

    val results = words.reduceByKey(_ + _)
    results.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
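
LoggerLevels is a helper from the tutorial codebase, not part of Spark. A minimal sketch of what it might look like (assumed implementation; it just quiets the noisy INFO logs so the print() output stays visible):

import org.apache.log4j.{Level, Logger}

object LoggerLevels {
  def setStreamingLogLevels(): Unit = {
    // Raise the root log level so batch output isn't buried in INFO logs
    Logger.getRootLogger.setLevel(Level.WARN)
  }
}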
