A summary of commonly used Flume features

1. netcat as source, logger as sink
Flume command:
./bin/flume-ng agent --conf conf --conf-file ./conf/example.conf -name a1 -Dflume.root.logger=INFO,console
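Here --conf points at Flume's configuration directory, --conf-file at the agent's properties file, and the agent name given on the command line must match the property prefix used inside that file (a1 below).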

Sender: [root@master ~]# telnet localhost 44444
Flume receives:

2021-01-21 22:15:39,225 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 31 0D                                           1. }
2021-01-21 22:15:39,225 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 31 0D                                           1. }
2021-01-21 22:15:39,226 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 31 0D                                           1. }
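(The logger sink prints each event body as hex plus a printable rendering: 31 is ASCII '1' and 0D is the carriage return that telnet appends.)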

The corresponding conf file, with annotations:

# Name the components on this agent (agent name: a1);
# r1 = source alias, k1 = sink alias, c1 = channel alias.
# Note: comments must sit on their own lines; Flume properties files
# do not support inline comments after a value.
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Regex filter interceptor on the source (disabled here; see the next example)
#a1.sources.r1.interceptors = i1
#a1.sources.r1.interceptors.i1.type = regex_filter
#a1.sources.r1.interceptors.i1.regex = ^[0-9]*$
#a1.sources.r1.interceptors.i1.excludeEvents = true

# Describe the sink
a1.sinks.k1.type = logger
#a1.channels = c1
#a1.sinks = k1
# Alternative sink: HDFS
#a1.sinks.k1.type = hdfs
# HDFS path where the data is stored
#a1.sinks.k1.hdfs.path = hdfs:/flume
# Prefix for the final file names
#a1.sinks.k1.hdfs.filePrefix = events
# Whether to round down the event timestamp used for bucketing (true = yes)
#a1.sinks.k1.hdfs.round = true
#a1.sinks.k1.hdfs.roundValue = 10
# Unit of the rounding value: minutes
#a1.sinks.k1.hdfs.roundUnit = minute
# Roll to a new file every 60 seconds (the property is rollInterval, not roundInterval)
#a1.sinks.k1.hdfs.rollInterval = 60
#a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Conf file with the regex interceptor enabled:

# Name the components on this agent (agent name: a1);
# r1 = source alias, k1 = sink alias, c1 = channel alias
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Regex filter on the source: drop events whose body is all digits
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_filter
a1.sources.r1.interceptors.i1.regex = ^[0-9]*$
a1.sources.r1.interceptors.i1.excludeEvents = true

# Describe the sink
a1.sinks.k1.type = logger
#a1.channels = c1
#a1.sinks = k1
# Alternative sink: HDFS
#a1.sinks.k1.type = hdfs
# HDFS path where the data is stored
#a1.sinks.k1.hdfs.path = hdfs:/flume
# Prefix for the final file names
#a1.sinks.k1.hdfs.filePrefix = events
# Whether to round down the event timestamp used for bucketing (true = yes)
#a1.sinks.k1.hdfs.round = true
#a1.sinks.k1.hdfs.roundValue = 10
# Unit of the rounding value: minutes
#a1.sinks.k1.hdfs.roundUnit = minute
# Roll to a new file every 60 seconds (the property is rollInterval, not roundInterval)
#a1.sinks.k1.hdfs.rollInterval = 60
#a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Send test data:

[root@master ~]# 
[root@master ~]# telnet localhost 44444
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
a
OK
b
OK
b
OK
c
OK
1
OK
2
OK
3
OK

Received data after filtering; the numeric lines have been dropped:

2021-01-21 22:20:47,799 (lifecycleSupervisor-1-5) [INFO - org.apache.flume.source.NetcatSource.start(NetcatSource.java:164)] Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/127.0.0.1:44444]
2021-01-21 22:21:03,633 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 61 0D                                           a. }
2021-01-21 22:21:04,374 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 62 0D                                           b. }
2021-01-21 22:21:05,517 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 62 0D                                           b. }
2021-01-21 22:21:06,133 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 63 0D                                           c. }

2. netcat as source, writing to HDFS as sink
./bin/flume-ng agent --conf conf --conf-file ./conf/example.conf -name a1 -Dflume.root.logger=INFO,console
Conf changes: comment out the logger sink and enable the HDFS sink and its HDFS path:

# Name the components on this agent (agent name: a1);
# r1 = source alias, k1 = sink alias, c1 = channel alias
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Regex filter on the source: drop events whose body is all digits
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_filter
a1.sources.r1.interceptors.i1.regex = ^[0-9]*$
a1.sources.r1.interceptors.i1.excludeEvents = true

# Describe the sink
# a1.sinks.k1.type = logger
a1.channels = c1
a1.sinks = k1
# HDFS sink
a1.sinks.k1.type = hdfs
# HDFS path where the data is stored
a1.sinks.k1.hdfs.path = hdfs:/flume
# Prefix for the final file names
a1.sinks.k1.hdfs.filePrefix = events
# Whether to round down the event timestamp used for bucketing (true = yes)
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
# Unit of the rounding value: minutes
a1.sinks.k1.hdfs.roundUnit = minute
# Roll to a new file every 60 seconds (the property is rollInterval, not roundInterval)
a1.sinks.k1.hdfs.rollInterval = 60
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Corresponding output:

2021-01-21 22:29:54,443 (lifecycleSupervisor-1-3) [INFO - org.apache.flume.source.NetcatSource.start(NetcatSource.java:150)] Source starting
2021-01-21 22:29:54,456 (lifecycleSupervisor-1-3) [INFO - org.apache.flume.source.NetcatSource.start(NetcatSource.java:164)] Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/127.0.0.1:44444]
2021-01-21 22:30:11,761 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.hdfs.HDFSDataStream.configure(HDFSDataStream.java:58)] Serializer = TEXT, UseRawLocalFileSystem = false
2021-01-21 22:30:12,016 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.hdfs.BucketWriter.open(BucketWriter.java:234)] Creating hdfs:/flume/events.1611239411762.tmp
2021-01-21 22:30:43,060 (hdfs-k1-roll-timer-0) [INFO - org.apache.flume.sink.hdfs.BucketWriter.close(BucketWriter.java:363)] Closing hdfs:/flume/events.1611239411762.tmp
2021-01-21 22:30:43,116 (hdfs-k1-call-runner-7) [INFO - org.apache.flume.sink.hdfs.BucketWriter$8.call(BucketWriter.java:629)] Renaming hdfs:/flume/events.1611239411762.tmp to hdfs:/flume/events.1611239411762
2021-01-21 22:30:43,132 (hdfs-k1-roll-timer-0) [INFO - org.apache.flume.sink.hdfs.HDFSEventSink$1.run(HDFSEventSink.java:394)] Writer callback called.
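(Note how the HDFS sink writes to a .tmp file first and renames it to its final name when the file is rolled, as the log above shows.)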
[root@master ~]# telnet localhost 44444
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
11
OK

OK

OK

OK
2
OK

OK
2
OK
3434342
OK
fs
OK
g
OK
ss
OK
fs
OK
[root@master ~]# hadoop fs -ls /flume
Found 2 items
-rw-r--r--   1 root supergroup         15 2020-06-09 07:34 /flume/events.1591659216989
-rw-r--r--   1 root supergroup         15 2021-01-21 22:30 /flume/events.1611239411762
[root@master ~]# 
[root@master ~]# 
[root@master ~]# hadoop fs -cat /flume/events.1611239411762
fs
g
ss
fs
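Note that the all-digit lines sent above (11, 2, 3434342) were dropped by the regex interceptor; only the non-numeric lines reached HDFS.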

3. netcat as source, HDFS as sink: how do you configure Flume to avoid creating too many small files?
a. Cap the data size of a single file (the value is a literal byte count; 200*1024*1024 = 209715200 bytes, i.e. 200 MB):
a1.sinks.k1.hdfs.rollSize = 209715200
b. Cap the number of events a single file may hold:
a1.sinks.k1.hdfs.rollCount = 10000

Conf file:

# Name the components on this agent (agent name: a1);
# r1 = source alias, k1 = sink alias, c1 = channel alias
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Regex filter on the source: drop events whose body is all digits
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_filter
a1.sources.r1.interceptors.i1.regex = ^[0-9]*$
a1.sources.r1.interceptors.i1.excludeEvents = true

# Describe the sink
# a1.sinks.k1.type = logger
a1.channels = c1
a1.sinks = k1
# HDFS sink
a1.sinks.k1.type = hdfs
# Roll by size and by event count as well, to avoid many small files;
# a file is rolled as soon as any of rollInterval / rollSize / rollCount triggers
a1.sinks.k1.hdfs.rollSize = 209715200
a1.sinks.k1.hdfs.rollCount = 10000
# HDFS path where the data is stored
a1.sinks.k1.hdfs.path = hdfs:/flume
# Prefix for the final file names
a1.sinks.k1.hdfs.filePrefix = events
# Whether to round down the event timestamp used for bucketing (true = yes)
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
# Unit of the rounding value: minutes
a1.sinks.k1.hdfs.roundUnit = minute
# Roll to a new file every 60 seconds (the property is rollInterval, not roundInterval)
a1.sinks.k1.hdfs.rollInterval = 60
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

A new file is generated every minute (matching rollInterval = 60 in the sink config):

./bin/flume-ng agent --conf conf --conf-file ./conf/example.conf -name a1 -Dflume.root.logger=INFO,console
[root@master ~]# 
[root@master ~]# telnet localhost 44444
Trying ::1...
telnet: connect to address ::1: Connection refused
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
hello
OK
zjpi^H^H
OK
zhouwf
OK

Check the result:

[root@master ~]# 
[root@master ~]# hadoop fs -cat /flume/events.1611585310928
hello
zjpi
zhouwf

4. HTTP as source, logger as sink
./bin/flume-ng agent --conf conf --conf-file ./conf/header_test.conf -name agent -Dflume.root.logger=INFO,console
(The agent name on the command line must be agent here, matching the property prefix used in the conf below.)

The conf is as follows:

#name the components on this agent
agent.sources = r1
agent.sinks = k1
agent.channels = c1
# Describe/configure the source
agent.sources.r1.type = org.apache.flume.source.http.HTTPSource
agent.sources.r1.bind = master
agent.sources.r1.port = 9989
#agent.sources.r1.fileHeader = true

#agent.sources.r1.interceptors = i1
#agent.sources.r1.interceptors.i1.type = regex_filter
#agent.sources.r1.interceptors.i1.regex = ^[0-9]*$
#agent.sources.r1.interceptors.i1.excludeEvents = true

# Describe the sink
agent.sinks.k1.type = logger

# Use a channel which buffers events in memory
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
agent.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
agent.sources.r1.channels = c1
agent.sinks.k1.channel = c1
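The original does not show a test request. HTTPSource's default handler is JSONHandler, which expects a JSON array of events, so a minimal test from the shell (assuming the agent is reachable at master:9989; the header name h1 is illustrative) would be:

curl -X POST -d '[{"headers":{"h1":"v1"},"body":"hello http source"}]' http://master:9989

Each element of the array becomes one Flume event with the given headers and body, which the logger sink then prints.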

5. Chaining two Flume agents over Avro (push from master, pull on slave2)
Start the pull agent on slave2:
./bin/flume-ng agent -c conf -f conf/pull.conf -n a2 -Dflume.root.logger=INFO,console
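pull.conf itself is not shown in the original. A minimal sketch consistent with the logs below (an Avro source on slave2:44444 feeding a logger sink; the channel sizing is an assumption, while the agent name, source type, host, and port come from the start command and the logs) would be:

# Agent a2 on slave2: Avro source -> memory channel -> logger sink
a2.sources = r1
a2.sinks = k1
a2.channels = c1

# Avro source listening for events pushed from master
a2.sources.r1.type = avro
a2.sources.r1.bind = slave2
a2.sources.r1.port = 44444
a2.sources.r1.channels = c1

# Log received events to the console
a2.sinks.k1.type = logger
a2.sinks.k1.channel = c1

# In-memory channel
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100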

Configure push.conf on master:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.sources.r1.channels = c1

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.keep-alive = 10
a1.channels.c1.capacity = 100000
a1.channels.c1.transactionCapacity = 100000

# Describe/configure the sink
# Note: the sink type is avro, and it points at the pull agent on slave2
a1.sinks.k1.type = avro
a1.sinks.k1.channel = c1
a1.sinks.k1.hostname = slave2
a1.sinks.k1.port = 44444

Start it:
./bin/flume-ng agent -c conf -f conf/push.conf -n a1 -Dflume.root.logger=INFO,console

Send data from the master node; slave2 receives:

2021-01-25 11:13:13,744 (lifecycleSupervisor-1-4) [INFO - org.apache.flume.source.AvroSource.start(AvroSource.java:253)] Avro source r1 started.

2021-01-25 11:13:42,400 (New I/O server boss #1 ([id: 0xa6a799e4, /192.168.96.12:44444])) [INFO - org.apache.avro.ipc.NettyServer$NettyServerAvroHandler.handleUpstream(NettyServer.java:171)] [id: 0x4e222782, /192.168.96.10:49504 => /192.168.96.12:44444] OPEN
2021-01-25 11:13:42,402 (New I/O  worker #1) [INFO - org.apache.avro.ipc.NettyServer$NettyServerAvroHandler.handleUpstream(NettyServer.java:171)] [id: 0x4e222782, /192.168.96.10:49504 => /192.168.96.12:44444] BOUND: /192.168.96.12:44444
2021-01-25 11:13:42,402 (New I/O  worker #1) [INFO - org.apache.avro.ipc.NettyServer$NettyServerAvroHandler.handleUpstream(NettyServer.java:171)] [id: 0x4e222782, /192.168.96.10:49504 => /192.168.96.12:44444] CONNECTED: /192.168.96.10:49504
2021-01-25 11:15:03,432 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 68 65 6C 6C 6F 0D                               hello. }
2021-01-25 11:15:26,434 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 31 0D                                           1. }
2021-01-25 11:15:26,434 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 32 0D                                           2. }
2021-01-25 11:15:26,435 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 33 0D                                           3. }
2021-01-25 11:15:26,435 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 34 0D                                           4. }
2021-01-25 11:15:26,435 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 35 0D                                           5. }
2021-01-25 11:15:26,435 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:94)] Event: { headers:{} body: 66 73 0D                                        fs. }
6. Integrating Flume with Kafka
Step 1: start ZooKeeper on all three nodes: ./zkServer.sh start
[root@slave1 ~]# cd $ZOOKEEPER_HOME
[root@slave1 zookeeper-3.4.11]# 
[root@slave1 zookeeper-3.4.11]# ./bin/zkServer.sh start
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper-3.4.11/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[root@slave1 zookeeper-3.4.11]# 
[root@slave1 zookeeper-3.4.11]# 
[root@slave1 zookeeper-3.4.11]# jps
1890 QuorumPeerMain
1638 DataNode
1704 NodeManager
1919 Jps
[root@slave1 zookeeper-3.4.11]# 
[root@slave1 zookeeper-3.4.11]# 
[root@slave1 zookeeper-3.4.11]# ./bin/zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /usr/local/src/zookeeper-3.4.11/bin/../conf/zoo.cfg
Mode: leader
[root@slave1 zookeeper-3.4.11]# 

Step 2: start Kafka on master
./bin/kafka-server-start.sh config/server.properties &
(the trailing & runs the broker as a background process)
Step 3: list the Kafka topics:

[root@master kafka_2.11-0.10.2.1]# ./bin/kafka-topics.sh --list --zookeeper localhost:2181
__consumer_offsets
test
[root@master kafka_2.11-0.10.2.1]#

Create a Kafka topic:
[root@master kafka_2.11-0.10.2.1]# ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic zhouwf_0131
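To verify the topic exists, kafka-topics.sh can also describe it:

./bin/kafka-topics.sh --describe --zookeeper localhost:2181 --topic zhouwf_0131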

Consume the Kafka topic. The idea is to simulate backend logs being written to a log file and then consumed through Kafka. Start a console consumer:

[root@master kafka_2.11-0.10.2.1]# 
[root@master kafka_2.11-0.10.2.1]# ./bin/kafka-console-consumer.sh --zookeeper master:2181 --topic zhouwf_0131 --from-beginning

Start Flume (a sketch of flume_kafka.conf follows the command):

[root@master apache-flume-1.6.0-bin]# ./bin/flume-ng agent --conf conf --conf-file conf/flume_kafka.conf -name a1 -Dflume.root.logger=INFO,console
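flume_kafka.conf is not shown in the original. A plausible sketch for Flume 1.6 (an exec source tailing the log file into the Kafka sink; the file path and broker address are assumptions) would be:

# Agent a1: exec source -> memory channel -> Kafka sink
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Tail the simulated log file (path is an assumption)
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/flume_exec_test.txt
a1.sources.r1.channels = c1

# Flume 1.6-style Kafka sink; broker address is an assumption
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = zhouwf_0131
a1.sinks.k1.brokerList = master:9092
a1.sinks.k1.channel = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100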

Truncate the old log file:
echo '' > flume_exec_test.txt

Use a Python script to simulate appending to the log file (a sketch of such a script follows the command):
python flume_data_write.py
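flume_data_write.py is not shown either. A minimal sketch that appends JSON order records shaped like the ones consumed below (field names taken from that output; the file path and value ranges are illustrative) might be:

import json
import random
import time

# Append simulated order events to the file tailed by the exec source
# (file name is an assumption, matching the echo command above)
with open("flume_exec_test.txt", "a") as f:
    for _ in range(100):
        record = {
            "order_id": random.randint(1, 3000000),
            "user_id": random.randint(1, 100000),
            "eval_set": random.choice(["prior", "train", "test"]),
            "order_number": random.randint(1, 20),
            "order_dow": random.randint(0, 6),
            "hour": random.randint(0, 23),
            "day": float(random.randint(0, 30)),
        }
        f.write(json.dumps(record) + "\n")
        f.flush()  # make each line visible to tail -F immediately
        time.sleep(0.1)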

You can see Kafka start consuming:

{"order_id": 1601212, "user_id": 46476, "eval_set": "prior", "order_number": 7, "order_dow": 0, "hour": 12, "day": 14.0}
{"order_id": 1946194, "user_id": 46476, "eval_set": "test", "order_number": 8, "order_dow": 0, "hour": 12, "day": 7.0}
{"order_id": 2185952, "user_id": 46477, "eval_set": "prior", "order_number": 1, "order_dow": 1, "hour": 23, "day": 0.0}
{"order_id": 1561966, "user_id": 46477, "eval_set": "prior", "order_number": 2, "order_dow": 2, "hour": 14, "day": 8.0}