flume配置

Flume

  1. 解压安装包
  2. 修改conf/flume-env.sh,设置JAVA_HOME

案例

官方案例

创建一个conf文件

# example.conf: A single-node Flume configuration
# Name the components on this agent
# 注意:Flume属性文件不支持行尾注释,注释需单独成行
# r1:表示a1的输入源
a1.sources = r1
# k1:表示a1的输出目的地
a1.sinks = k1
# c1:表示a1的缓冲区
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# put:source将Event放入channel
a1.sources.r1.channels = c1
# take:sink从channel取出Event
a1.sinks.k1.channel = c1

单节点启动监听端口

$ bin/flume-ng agent --conf /flume/conf --conf-file example.conf --name a1 -Dflume.root.logger=INFO,console

1.1实时读取本地文件到HDFS案例

案例需求:实时监控Hive日志,上传到HDFS

  1. flume想要将数据输出到HDFS,必须持有Hadoop相关的jar包
    在这里插入图片描述

  2. 创建flume-file-hdfs.conf文件

    # Name the components on this agent
    a2.sources = r2
    a2.sinks = k2
    a2.channels = c2
    
    # Describe/configure the source
    #定义source类型为exec可执行命令的
    a2.sources.r2.type = exec
    a2.sources.r2.command = tail -F /home/admin/modules/apache-hive-1.2.2-bin/hive.log
    #执行shell脚本的绝对路径
    a2.sources.r2.shell = /bin/bash -c
    
    # Describe the sink
    a2.sinks.k2.type = hdfs
    a2.sinks.k2.hdfs.path = hdfs://linux01:8020/flume/%Y%m%d/%H
    #上传文件的前缀
    a2.sinks.k2.hdfs.filePrefix = logs-
    #是否按照时间滚动文件夹
    a2.sinks.k2.hdfs.round = true
    #多少时间单位创建一个新的文件夹
    a2.sinks.k2.hdfs.roundValue = 1
    #重新定义时间单位
    a2.sinks.k2.hdfs.roundUnit = hour
    #是否使用本地时间戳
    a2.sinks.k2.hdfs.useLocalTimeStamp = true
    #积攒多少个Event才flush到HDFS一次
    a2.sinks.k2.hdfs.batchSize = 1000
    #设置文件类型,可支持压缩
    a2.sinks.k2.hdfs.fileType = DataStream
    #多久生成一个新的文件
    a2.sinks.k2.hdfs.rollInterval = 600
    #设置每个文件的滚动大小
    a2.sinks.k2.hdfs.rollSize = 134217700
    #文件的滚动与Event数量无关
    a2.sinks.k2.hdfs.rollCount = 0
    #最小冗余数
    a2.sinks.k2.hdfs.minBlockReplicas = 1
    
    # Use a channel which buffers events in memory
    a2.channels.c2.type = memory
    a2.channels.c2.capacity = 1000
    a2.channels.c2.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a2.sources.r2.channels = c2
    a2.sinks.k2.channel = c2
    

1.2监控目录文件到HDFS案例

  1. 创建文件flume-dir-hdfs.conf

    a3.sources = r3
    a3.sinks = k3
    a3.channels = c3
    
    # Describe/configure the source
    a3.sources.r3.type = spooldir
    a3.sources.r3.spoolDir = /home/admin/modules/apache-flume-1.7.0-bin/upload
    a3.sources.r3.fileHeader = true
    a3.sources.r3.fileSuffix = .COMPLETED
    #忽略所有以.tmp结尾的文件,不上传
    a3.sources.r3.ignorePattern = ([^ ]*\.tmp)
    
    # Describe the sink
    a3.sinks.k3.type = hdfs
    a3.sinks.k3.hdfs.path = hdfs://linux01:8020/flume/upload/%Y%m%d/%H
    #上传文件的前缀
    a3.sinks.k3.hdfs.filePrefix = upload-
    #是否按照时间滚动文件夹
    a3.sinks.k3.hdfs.round = true
    #多少时间单位创建一个新的文件夹
    a3.sinks.k3.hdfs.roundValue = 1
    #重新定义时间单位
    a3.sinks.k3.hdfs.roundUnit = hour
    #是否使用本地时间戳
    a3.sinks.k3.hdfs.useLocalTimeStamp = true
    #积攒多少个Event才flush到HDFS一次
    a3.sinks.k3.hdfs.batchSize = 100
    #设置文件类型,可支持压缩
    a3.sinks.k3.hdfs.fileType = DataStream
    #多久生成一个新的文件
    a3.sinks.k3.hdfs.rollInterval = 600
    #设置多大生成新文件
    a3.sinks.k3.hdfs.rollSize = 134217700
    #设置文件的滚动与Event数量无关
    a3.sinks.k3.hdfs.rollCount = 0
    #副本数
    a3.sinks.k3.hdfs.minBlockReplicas = 1
    
    # Use a channel which buffers events in memory
    a3.channels.c3.type = memory
    a3.channels.c3.capacity = 1000
    a3.channels.c3.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a3.sources.r3.channels = c3
    a3.sinks.k3.channel = c3
    
    
  2. 开启监控

    $ bin/flume-ng agent --conf /flume/conf --conf-file flume-dir-hdfs.conf --name a3
    

1.3单数据源多出口案例

  1. 单source、Channel多Sink组(负载均衡)

    (图示:单Source、单Channel、多Sink组(负载均衡)架构图,原图 assets/1590888106025.png)

    • 创建flume1-telnet-flume.conf

      # Name the components on this agent
      a1.sources = r1
      a1.sinkgroups = g1
      a1.sinks = k1 k2
      a1.channels = c1
      
      # Describe/configure the source
      a1.sources.r1.type = netcat
      a1.sources.r1.bind = localhost
      # 要监控的端口(Flume属性文件不支持行尾注释,注释需单独成行)
      a1.sources.r1.port = 44444
      
      # 属性前缀应为 sinkgroups(与上文 a1.sinkgroups = g1 一致);负载均衡
      a1.sinkgroups.g1.processor.type = load_balance
      a1.sinkgroups.g1.processor.backoff = true
      # sink1、sink2轮循,也可设置为随机 random
      a1.sinkgroups.g1.processor.selector = round_robin
      a1.sinkgroups.g1.processor.selector.maxTimeOut = 10000
      
      # Describe the sink
      a1.sinks.k1.type = avro
      a1.sinks.k1.hostname = linux01
      a1.sinks.k1.port = 4141
      
      a1.sinks.k2.type = avro
      a1.sinks.k2.hostname = linux01
      a1.sinks.k2.port = 4142
      
      # Describe the channel
      a1.channels.c1.type = memory
      a1.channels.c1.capacity = 1000
      a1.channels.c1.transactionCapacity = 100
      
      # Bind the source and sink to the channel
      a1.sources.r1.channels = c1
      a1.sinkgroups.g1.sinks = k1 k2
      a1.sinks.k1.channel = c1
      a1.sinks.k2.channel = c1
      
    • 创建flume2-flume-console1.conf

      # Name the components on this agent
      a2.sources = r1
      a2.sinks = k1
      a2.channels = c1
      
      # Describe/configure the source
      a2.sources.r1.type = avro
      a2.sources.r1.bind = linux01
      a2.sources.r1.port = 4141
      
      # Describe the sink
      a2.sinks.k1.type = logger
      
      # Use a channel which buffers events in memory
      a2.channels.c1.type = memory
      a2.channels.c1.capacity = 1000
      a2.channels.c1.transactionCapacity = 100
      
      # Bind the source and sink to the channel
      a2.sources.r1.channels = c1
      a2.sinks.k1.channel = c1
      
    • 创建flume3-flume-console2.conf

      # Name the components on this agent
      a3.sources = r1
      a3.sinks = k1
      a3.channels = c1
      
      # Describe/configure the source
      a3.sources.r1.type = avro
      a3.sources.r1.bind = linux01
      a3.sources.r1.port = 4142
      
      # Describe the sink
      a3.sinks.k1.type = logger
      
      # Describe the channel
      a3.channels.c1.type = memory
      a3.channels.c1.capacity = 1000
      a3.channels.c1.transactionCapacity = 100
      
      # Bind the source and sink to the channel
      a3.sources.r1.channels = c1
      a3.sinks.k1.channel = c1
      
    • 启动

      # a2、a3在控制台打印
      $ bin/flume-ng agent --conf /flume/conf --conf-file flume3-flume-console2.conf --name a3 -Dflume.root.logger=INFO,console
      $ bin/flume-ng agent --conf /flume/conf --conf-file flume2-flume-console1.conf --name a2 -Dflume.root.logger=INFO,console
      $ bin/flume-ng agent --conf /flume/conf --conf-file flume1-telnet-flume.conf --name a1
      
      
  2. 单source多Channel、Sink

在这里插入图片描述

  • 创建flume1-file-flume.conf

    # Name the components on this agent
    a1.sources = r1
    a1.sinks = k1 k2
    a1.channels = c1 c2
    # 将数据流复制给多个channel
    a1.sources.r1.selector.type = replicating
    
    # Describe/configure the source
    a1.sources.r1.type = exec
    a1.sources.r1.command = tail -F /home/admin/modules/apache-hive-1.2.2-bin/hive.log
    a1.sources.r1.shell = /bin/bash -c
    
    # Describe the sink
    a1.sinks.k1.type = avro
    a1.sinks.k1.hostname = linux01 
    a1.sinks.k1.port = 4141
    
    a1.sinks.k2.type = avro
    a1.sinks.k2.hostname = linux01
    a1.sinks.k2.port = 4142
    
    # Describe the channel
    a1.channels.c1.type = memory
    a1.channels.c1.capacity = 1000
    a1.channels.c1.transactionCapacity = 100
    
    a1.channels.c2.type = memory
    a1.channels.c2.capacity = 1000
    a1.channels.c2.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a1.sources.r1.channels = c1 c2
    a1.sinks.k1.channel = c1
    a1.sinks.k2.channel = c2
    
    
    
  • 创建flume2-flume-hdfs.conf

    # Name the components on this agent
    a2.sources = r1
    a2.sinks = k1
    a2.channels = c1
    
    # Describe/configure the source
    a2.sources.r1.type = avro
    # bind 的主机名需要和第一个配置文件中 sink 的 hostname 一致
    a2.sources.r1.bind = linux01
    a2.sources.r1.port = 4141
    
    # Describe the sink
    a2.sinks.k1.type = hdfs
    a2.sinks.k1.hdfs.path = hdfs://linux01:8020/flume2/%Y%m%d/%H
    #上传文件的前缀
    a2.sinks.k1.hdfs.filePrefix = flume2-
    #是否按照时间滚动文件夹
    a2.sinks.k1.hdfs.round = true
    #多少时间单位创建一个新的文件夹
    a2.sinks.k1.hdfs.roundValue = 1
    #重新定义时间单位
    a2.sinks.k1.hdfs.roundUnit = hour
    #是否使用本地时间戳
    a2.sinks.k1.hdfs.useLocalTimeStamp = true
    #积攒多少个Event才flush到HDFS一次
    a2.sinks.k1.hdfs.batchSize = 100
    #设置文件类型,可支持压缩
    a2.sinks.k1.hdfs.fileType = DataStream
    #多久生成一个新的文件
    a2.sinks.k1.hdfs.rollInterval = 600
    #设置每个文件的滚动大小大概是128M
    a2.sinks.k1.hdfs.rollSize = 134217700
    #文件的滚动与Event数量无关
    a2.sinks.k1.hdfs.rollCount = 0
    #最小冗余数
    a2.sinks.k1.hdfs.minBlockReplicas = 1
    
    # Describe the channel
    a2.channels.c1.type = memory
    a2.channels.c1.capacity = 1000
    a2.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a2.sources.r1.channels = c1
    a2.sinks.k1.channel = c1
    
    
  • 创建flume3-flume-dir.conf

    # Name the components on this agent
    a3.sources = r1
    a3.sinks = k1
    a3.channels = c1
    
    # Describe/configure the source
    a3.sources.r1.type = avro
    a3.sources.r1.bind = linux01
    a3.sources.r1.port = 4142
    
    # Describe the sink
    # file_roll:实时滚动生成本地文件,默认每30秒生成一个新文件
    a3.sinks.k1.type = file_roll
    # 输出目录必须已存在,Flume不会自动创建
    a3.sinks.k1.sink.directory = /home/admin/Desktop/flume3
    
    # Describe the channel
    a3.channels.c1.type = memory
    a3.channels.c1.capacity = 1000
    a3.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a3.sources.r1.channels = c1
    a3.sinks.k1.channel = c1
    
    
  • 启动

    $ bin/flume-ng agent --conf /flume/conf --conf-file flume3-flume-dir.conf --name a3
    $ bin/flume-ng agent --conf /flume/conf --conf-file flume2-flume-hdfs.conf --name a2
    $ bin/flume-ng agent --conf /flume/conf --conf-file flume1-file-flume.conf --name a1
    
    

1.4多数据源汇总案例

案例需求:

hadoop3上的flume-1监控文件/opt/module/group.log

hadoop2上的flume-2监控某一个端口的数据流

flume-1与flume-2将数据发送给hadoop4上的flume-3,flume-3将最终数据打印到控制台

在这里插入图片描述

  1. hadoop3上创建flume1-logger-flume.conf

    # Name the components on this agent
    a1.sources = r1
    a1.sinks = k1
    a1.channels = c1
    
    # Describe/configure the source
    a1.sources.r1.type = exec
    a1.sources.r1.command = tail -F /home/admin/modules/apache-hive-1.2.2-bin/hive.log
    a1.sources.r1.shell = /bin/bash -c
    
    # Describe the sink
    a1.sinks.k1.type = avro
    a1.sinks.k1.hostname = hadoop4  #将数据汇总到hadoop4
    a1.sinks.k1.port = 4141
    
    # Describe the channel
    a1.channels.c1.type = memory
    a1.channels.c1.capacity = 1000
    a1.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a1.sources.r1.channels = c1
    a1.sinks.k1.channel = c1
    
    
  2. hadoop2上创建flume2-netcat-flume.conf

    # Name the components on this agent
    a2.sources = r1
    a2.sinks = k1
    a2.channels = c1
    
    # Describe/configure the source
    a2.sources.r1.type = netcat
    a2.sources.r1.bind = hadoop2
    a2.sources.r1.port = 44444
    
    # Describe the sink
    a2.sinks.k1.type = avro
    a2.sinks.k1.hostname = hadoop4
    a2.sinks.k1.port = 4141
    
    # Use a channel which buffers events in memory
    a2.channels.c1.type = memory
    a2.channels.c1.capacity = 1000
    a2.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a2.sources.r1.channels = c1
    a2.sinks.k1.channel = c1
    
    
    
  3. hadoop4上创建flume3-flume-logger.conf

    # Name the components on this agent
    a3.sources = r1
    a3.sinks = k1
    a3.channels = c1
    
    # Describe/configure the source
    a3.sources.r1.type = avro
    a3.sources.r1.bind = hadoop4
    a3.sources.r1.port = 4141
    
    # Describe the sink
    a3.sinks.k1.type = logger
    
    
    # Use a channel which buffers events in memory
    a3.channels.c1.type = memory
    a3.channels.c1.capacity = 1000
    a3.channels.c1.transactionCapacity = 100
    
    # Bind the source and sink to the channel
    a3.sources.r1.channels = c1
    a3.sinks.k1.channel = c1
    
    
    
  4. 启动

    #在hadoop4控制台输出打印
    hadoop4>>$ bin/flume-ng agent --conf /flume/conf --conf-file flume3-flume-logger.conf --name a3 -Dflume.root.logger=INFO,console
    hadoop2>>$ bin/flume-ng agent --conf /flume/conf --conf-file flume2-netcat-flume.conf --name a2 
    hadoop3>>$ bin/flume-ng agent --conf /flume/conf --conf-file flume1-logger-flume.conf --name a1
    
    

1.5 flume对接kafka

1.创建flume-flume-kafka.conf

#Name
a1.sources = r1
a1.channels = c1
a1.sinks = k1
#Source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

#Channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

#sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
# kafka话题
a1.sinks.k1.kafka.topic = first
# kafka集群的broker地址列表
a1.sinks.k1.kafka.bootstrap.servers = hadoop1:9092,hadoop2:9092,hadoop3:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1

#Bind
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

2.启动消费者

3.启动flume

Flume数据传输监控,使用第三方框架Ganglia

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值