Flume基础【事务、Agent内部原理、拓扑结构、几个案例的配置文件】

本文链接：https://blog.csdn.net/weixin_43923463/article/details/124108317

文章目录

一 Flume事务
二 Flume Agent内部原理
三 Flume拓扑结构
四 案例

一 Flume事务

事务的作用：把多个操作绑定到一起，使得这些操作要么一起成功，要么一起失败。

Flume事务的作用：保证数据在Source到Channel（Put事务）以及Channel到Sink（Take事务）的传递过程中不会丢失。

二 Flume Agent内部原理

在这里插入图片描述

三 Flume拓扑结构

1 简单串联

在这里插入图片描述

2 复制和多路复用

在这里插入图片描述

3 负载均衡和故障转移

在这里插入图片描述

4 聚合

在这里插入图片描述

四 案例

1 拦截器

# Agent a1: netcat source -> memory channel -> logger sink.
# The source carries a timestamp interceptor that adds a timestamp to each event's headers.

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: netcat listener ----
a1.sources.r1.type = netcat
a1.sources.r1.port = 44444
a1.sources.r1.bind = hadoop101
# Interceptor chain containing a single interceptor, i1 (adds a timestamp to the headers)
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

2 串联

# agent1 (hadoop102): netcat source --> memory channel --> avro sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: netcat listener ----
a1.sources.r1.type = netcat
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: avro ----
a1.sinks.k1.type = avro
a1.sinks.k1.port = 33333
# hostname is the machine the data is written to
a1.sinks.k1.hostname = hadoop103

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

----------------------------------------------------------
# NOTE: start the agent on hadoop103 first, because hadoop103 receives the data sent by hadoop102.
# agent2 (hadoop103): avro source --> memory channel --> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

3 复制和多路复用

# agent1 (hadoop101): exec source -> two memory channels (c1, c2) -> two avro sinks (k1, k2)

# ---- component names ----
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# ---- source r1: follows a log file via tail -F ----
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/module/flume/demo/123.log

# Channel selector, option 1 - replicating (the default; may be left unconfigured)
#a1.sources.r1.selector.type = replicating

# Channel selector, option 2 - multiplexing
a1.sources.r1.selector.type = multiplexing
# An event is (headers | body); routing is decided by a key/value pair in the headers.
# "state" is the header key that is inspected.
a1.sources.r1.selector.header = state
# Header value US -> channel c2
a1.sources.r1.selector.mapping.US = c2
# Header value CZ -> channel c1
a1.sources.r1.selector.mapping.CZ = c1

# Requirement: add data to the event headers.
# The static interceptor sets a user-defined key and value on the headers of every event.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
# Header value to set
a1.sources.r1.interceptors.i1.value = CZ
# Header key to set
a1.sources.r1.interceptors.i1.key = state

# ---- channels: both in-memory ----
a1.channels.c2.type = memory
a1.channels.c1.type = memory

# ---- sink k1: avro to hadoop102 ----
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# ---- sink k2: avro to hadoop103 ----
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# ---- wiring: one source feeds two channels; each sink drains one channel ----
a1.sinks.k2.channel = c2
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1 c2

-----------------------------------------------------------------

# agent2 (hadoop102): avro source -> memory channel -> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1


-----------------------------------------------------------------

# agent3 (hadoop103): avro source -> memory channel -> file_roll sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# Avro source listening on hadoop103.
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop103
# The port must be a valid TCP port (<= 65535) and must match the port that the
# upstream avro sink on hadoop101 (k2 -> hadoop103:33333) sends to.
a1.sources.r1.port = 33333

# In-memory channel.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000

# Alternative: log events instead of writing them to disk.
#a1.sinks.k1.type = logger
# Store event data on the local disk (writing to local disk is rarely done in practice).
a1.sinks.k1.type = file_roll
# Directory in which events are stored
a1.sinks.k1.sink.directory = /opt/module/flume/demo
# How often to roll over to a new file (30 seconds)
a1.sinks.k1.sink.rollInterval = 30

# Wiring: source -> channel -> sink
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

4 负载均衡和故障转移

（1）故障转移

# agent1 (hadoop101): netcat source -> memory channel -> sink group (k1, k2) with failover

# ---- component names ----
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1

# ---- source r1: netcat listener ----
a1.sources.r1.type = netcat
a1.sources.r1.port = 11111
a1.sources.r1.bind = hadoop101

# ---- sink group ----
# When one channel feeds several sinks, a sink group must be defined.
a1.sinkgroups = g1
# Sink instances that belong to this group
a1.sinkgroups.g1.sinks = k1 k2
# Sink processor type: 1. failover  2. load_balance
a1.sinkgroups.g1.processor.type = failover
# Sink priorities: the larger the number, the higher the priority
a1.sinkgroups.g1.processor.priority.k2 = 10
a1.sinkgroups.g1.processor.priority.k1 = 5

# ---- channel c1: in-memory ----
a1.channels.c1.type = memory

# ---- sink k1: avro to hadoop102 ----
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# ---- sink k2: avro to hadoop103 ----
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# ---- wiring: both sinks drain the same channel ----
a1.sinks.k2.channel = c1
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

-----------------------------
# agent2 (hadoop102): avro source -> memory channel -> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

--------------------------------
# agent3 (hadoop103): avro source -> memory channel -> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

（2）负载均衡

# agent1 (hadoop101): netcat source -> memory channel -> sink group (k1, k2) with load balancing

# ---- component names ----
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1

# ---- source r1: netcat listener ----
a1.sources.r1.type = netcat
a1.sources.r1.port = 11111
a1.sources.r1.bind = hadoop101

# ---- sink group ----
# Define a sink group
a1.sinkgroups = g1
# Sink instances that belong to the group
a1.sinkgroups.g1.sinks = k1 k2
# Sink processor type (load balancing)
a1.sinkgroups.g1.processor.type = load_balance
# Selector options: 1. random - random assignment  2. round_robin - take turns in order
a1.sinkgroups.g1.processor.selector = random

# ---- channel c1: in-memory ----
a1.channels.c1.type = memory

# ---- sink k1: avro to hadoop102 ----
a1.sinks.k1.type = avro
a1.sinks.k1.port = 22222
a1.sinks.k1.hostname = hadoop102

# ---- sink k2: avro to hadoop103 ----
a1.sinks.k2.type = avro
a1.sinks.k2.port = 33333
a1.sinks.k2.hostname = hadoop103

# ---- wiring: both sinks drain the same channel ----
a1.sinks.k2.channel = c1
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1

-----------------------------
# agent2 (hadoop102): avro source -> memory channel -> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 22222
a1.sources.r1.bind = hadoop102

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1


--------------------------------
# agent3 (hadoop103): avro source -> memory channel -> logger sink

# ---- component names ----
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# ---- source r1: avro listener ----
a1.sources.r1.type = avro
a1.sources.r1.port = 33333
a1.sources.r1.bind = hadoop103

# ---- channel c1: in-memory ----
a1.channels.c1.capacity = 1000
a1.channels.c1.type = memory

# ---- sink k1: logger ----
a1.sinks.k1.type = logger

# ---- wiring: source -> channel -> sink ----
a1.sinks.k1.channel = c1
a1.sources.r1.channels = c1