文章目录
一 Flume事务
事务的作用:把多个操作绑定到一起,使得这些操作要么一起成功,要么一起失败。
Flume事务的作用:保证数据不会丢失。
二 Flume Agent内部原理
三 Flume拓扑结构
1 简单串联
2 复制和多路复用
3 负载均衡和故障转移
4 聚合
四 案例
1 拦截器
# Agent a1: netcat source -> memory channel -> logger sink.
# The source carries a timestamp interceptor.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: netcat listener on hadoop101:44444 ---
a1.sources.r1.type = netcat
a1.sources.r1.port = 44444
a1.sources.r1.bind = hadoop101
# Interceptor i1 adds a timestamp to the headers of each event.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
2 串联
#agent1(hadoop102) netcatsource --> memorychannel --> avrosink
# Upstream agent of the chain: reads from a netcat source and forwards
# events through an Avro sink to the agent on hadoop103.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: netcat listener on hadoop102:22222 ---
a1.sources.r1.type = netcat
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# --- Sink k1: Avro sink ---
a1.sinks.k1.type = avro
# hostname/port identify the machine the data is written to.
a1.sinks.k1.port = 33333
a1.sinks.k1.hostname = hadoop103

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
----------------------------------------------------------
#注意:要先启动hadoop103上的agent,因为它负责接收hadoop102发送过来的数据
# agent2 (hadoop103): avro source ---> memory channel ---> logger sink
# Downstream agent of the chain.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop103:33333 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
3 复制和多路复用
# agent1 (hadoop101): one exec source fanned out to two memory channels,
# each drained by its own Avro sink (k1 -> hadoop102, k2 -> hadoop103).

# --- Components owned by this agent ---
a1.sinks = k1 k2
a1.channels = c1 c2
a1.sources = r1

# --- Channels: two in-memory buffers ---
a1.channels.c2.type = memory
a1.channels.c1.type = memory

# --- Source r1: follows a log file via tail -F ---
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/module/flume/demo/123.log

# --- Channel selector ---
# Replicating is the default selector, so it does not have to be configured:
#a1.sources.r1.selector.type = replicating
# Multiplexing: an event (headers | body) is routed according to a key/value
# pair found in its headers.
a1.sources.r1.selector.type = multiplexing
# "state" is the header key that is inspected.
a1.sources.r1.selector.header = state
# Header value US is sent to c2.
a1.sources.r1.selector.mapping.US = c2
# Header value CZ is sent to c1.
a1.sources.r1.selector.mapping.CZ = c1

# --- Interceptor: puts data into the event headers ---
# The static interceptor sets a user-defined key and value on the headers
# of every event.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
# Header value to set.
a1.sources.r1.interceptors.i1.value = CZ
# Header key to set.
a1.sources.r1.interceptors.i1.key = state

# --- Sink k1: Avro sink to hadoop102:22222 ---
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# --- Sink k2: Avro sink to hadoop103:33333 ---
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# --- Wiring: the single source feeds both channels ---
a1.sinks.k2.channel = c2
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1 c2
-----------------------------------------------------------------
# agent2 (hadoop102): avro source -> memory channel -> logger sink.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop102:22222 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
-----------------------------------------------------------------
# agent3 (hadoop103): avro source -> memory channel -> file_roll sink.
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# Source r1: Avro listener on hadoop103.
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop103
# Must equal the port of agent1's Avro sink k2 (33333). The previous value
# 333333 is not a valid TCP port (maximum is 65535).
a1.sources.r1.port = 33333

# Channel c1: in-memory buffer.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000

# Alternative sink that just logs the events:
#a1.sinks.k1.type = logger
# Store event data on the local disk (writing to local disk is rarely used).
a1.sinks.k1.type = file_roll
# Directory in which the events are stored.
a1.sinks.k1.sink.directory = /opt/module/flume/demo
# How often to roll over to a new file (30 seconds).
a1.sinks.k1.sink.rollInterval = 30

# Wiring: r1 writes to c1, k1 reads from c1.
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
4 负载均衡和故障转移
(1)故障转移
# agent1 (hadoop101): netcat source -> memory channel -> failover sink group
# made of two Avro sinks (k1 -> hadoop102, k2 -> hadoop103).

# --- Components owned by this agent ---
a1.sinks = k1 k2
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.type = memory

# --- Source r1: netcat listener on hadoop101:11111 ---
a1.sources.r1.type = netcat
a1.sources.r1.port = 11111
a1.sources.r1.bind = hadoop101

# --- Sink group ---
# A sink group is required when one channel is served by several sinks.
a1.sinkgroups = g1
# Sink instances that belong to the group.
a1.sinkgroups.g1.sinks = k1 k2
# Sink processor type: failover or load_balance.
a1.sinkgroups.g1.processor.type = failover
# Sink priorities; the larger the number, the higher the priority.
a1.sinkgroups.g1.processor.priority.k2 = 10
a1.sinkgroups.g1.processor.priority.k1 = 5

# --- Sink k1: Avro sink to hadoop102:22222 ---
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# --- Sink k2: Avro sink to hadoop103:33333 ---
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# --- Wiring: both sinks drain the same channel ---
a1.sinks.k2.channel = c1
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
-----------------------------
# agent2 (hadoop102): avro source -> memory channel -> logger sink.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop102:22222 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
--------------------------------
# agent3 (hadoop103): avro source -> memory channel -> logger sink.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop103:33333 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
(2)负载均衡
# agent1 (hadoop101): netcat source -> memory channel -> load-balancing sink
# group made of two Avro sinks (k1 -> hadoop102, k2 -> hadoop103).

# --- Components owned by this agent ---
a1.sinks = k1 k2
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.type = memory

# --- Source r1: netcat listener on hadoop101:11111 ---
a1.sources.r1.type = netcat
a1.sources.r1.port = 11111
a1.sources.r1.bind = hadoop101

# --- Sink group ---
# Define a sink group.
a1.sinkgroups = g1
# Sink instances that belong to the group.
a1.sinkgroups.g1.sinks = k1 k2
# Sink processor type (load balancing).
a1.sinkgroups.g1.processor.type = load_balance
# Selection strategy: random (random assignment) or round_robin (take turns).
a1.sinkgroups.g1.processor.selector = random

# --- Sink k1: Avro sink to hadoop102:22222 ---
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# --- Sink k2: Avro sink to hadoop103:33333 ---
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# --- Wiring: both sinks drain the same channel ---
a1.sinks.k2.channel = c1
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
-----------------------------
# agent2 (hadoop102): avro source -> memory channel -> logger sink.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop102:22222 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1
--------------------------------
# agent3 (hadoop103): avro source -> memory channel -> logger sink.

# --- Components owned by this agent ---
a1.sinks = k1
a1.channels = c1
a1.sources = r1

# --- Channel c1: in-memory buffer ---
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# --- Source r1: Avro listener on hadoop103:33333 ---
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# --- Sink k1: logger ---
a1.sinks.k1.type = logger

# --- Wiring: r1 writes to c1, k1 reads from c1 ---
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1