Flume + Kafka

1. Checking Kafka consumption status

kafka-run-class kafka.tools.ConsumerOffsetChecker --group groupname --topic topicname --zookeeper ip1:2181,ip2:2181,ip3:2181

Explanation of the output columns:

Group           Topic                          Pid Offset          logSize         Lag             Owner

Group   — consumer group name
Topic   — topic name
Pid     — partition id
Offset  — number of messages consumed so far
logSize — total number of messages in the partition
Lag     — number of messages not yet consumed (logSize - Offset)
Owner   — the consumer instance that owns the partition
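A sample line of output, with hypothetical values:

Group        Topic              Pid  Offset   logSize  Lag  Owner
flume57_pc   suggest_pc_action  0    115891   115891   0    none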

2. Flume 1.6: multi-source, multi-channel, multi-sink configuration

# Names of the sources
agent.sources = pc app
# Names of the channels; naming them after the channel type is recommended
agent.channels = FilePc  FileApp
# Names of the sinks; naming them after the destination is recommended
agent.sinks = hdfsSinkPc  hdfsSinkApp

# Channels each source writes to
agent.sources.pc.channels = FilePc
agent.sources.app.channels = FileApp

# Channel each sink reads from; note the property here is singular: channel
agent.sinks.hdfsSinkPc.channel = FilePc
agent.sinks.hdfsSinkApp.channel = FileApp

#-------- KafkaSource configuration -----------------
# Source type
agent.sources.pc.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.app.type = org.apache.flume.source.kafka.KafkaSource
# ZooKeeper ensemble that Kafka registers in
#
# Careful here: this must be Kafka's own ZooKeeper address; mind the Kafka version
#
agent.sources.pc.zookeeperConnect = 10.15.201.197:2181,10.15.201.198:2181,10.15.201.199:2181
agent.sources.app.zookeeperConnect = 10.15.201.197:2181,10.15.201.198:2181,10.15.201.199:2181
# Kafka topics to consume
agent.sources.pc.topic = suggest_pc_action
agent.sources.app.topic = suggest_app_action

# Consumer group ids
agent.sources.pc.groupId = flume57_pc
agent.sources.app.groupId = flume57_app
# Consumer timeout in ms. Any other Kafka consumer option can be set the same
# way: properties prefixed with kafka. are passed through to the consumer
agent.sources.pc.kafka.consumer.timeout.ms = 1000
agent.sources.app.kafka.consumer.timeout.ms = 1000
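For instance (a sketch, assuming the old-consumer property names that Flume 1.6's KafkaSource passes through), a new group can be told to start from the earliest available offset:

# hypothetical addition: where a group with no committed offset starts reading
agent.sources.pc.kafka.auto.offset.reset = smallest
agent.sources.app.kafka.auto.offset.reset = smallest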



#------- Channel configuration -------------------------
# Channel type
agent.channels.FilePc.type = File
agent.channels.FileApp.type = File
# Maximum number of events the channel can hold
agent.channels.FilePc.capacity=10000
agent.channels.FileApp.capacity=10000
# Maximum number of events per transaction
agent.channels.FilePc.transactionCapacity=1000
agent.channels.FileApp.transactionCapacity=1000
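One sizing rule worth keeping in mind: the KafkaSource hands events to the channel in batches, and a batch must fit into a single channel transaction, so the source batch size should not exceed transactionCapacity. A minimal sketch (batchSize is the Flume 1.6 KafkaSource property, default 1000):

# keep the source batch size <= the channel's transactionCapacity (1000 here)
agent.sources.pc.batchSize = 1000
agent.sources.app.batchSize = 1000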

#---------hdfsSink configuration------------------
agent.sinks.hdfsSinkPc.type = hdfs
agent.sinks.hdfsSinkApp.type = hdfs
# Note: output lands in an hourly subdirectory (%Y%m%d%H)
agent.sinks.hdfsSinkPc.hdfs.path = hdfs://namenodeip:8020/xcardata/log/kafka_log/pc_action_log/%Y%m%d%H
agent.sinks.hdfsSinkPc.hdfs.writeFormat = Text
agent.sinks.hdfsSinkPc.hdfs.fileType = DataStream
# Default 1024: roll the temporary file into a final file once it reaches this size (bytes)
agent.sinks.hdfsSinkPc.hdfs.rollSize = 10240000
# Roll once this many events have been written; 0 disables count-based rolling
agent.sinks.hdfsSinkPc.hdfs.rollCount = 0
# Default 30: roll the temporary file after this many seconds; 0 disables time-based rolling
agent.sinks.hdfsSinkPc.hdfs.rollInterval = 60


agent.sinks.hdfsSinkApp.hdfs.path = hdfs://namenodeip:8020/xcardata/log/kafka_log/app_action_log/%Y%m%d%H
agent.sinks.hdfsSinkApp.hdfs.writeFormat = Text
agent.sinks.hdfsSinkApp.hdfs.fileType = DataStream
agent.sinks.hdfsSinkApp.hdfs.rollSize = 10240000
agent.sinks.hdfsSinkApp.hdfs.rollCount = 0
agent.sinks.hdfsSinkApp.hdfs.rollInterval = 60
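The %Y%m%d%H escapes in hdfs.path are resolved from each event's timestamp header, which Flume 1.6's KafkaSource normally sets. If your events can ever arrive without one, a defensive option (a sketch using the stock hdfs sink property) is to fall back to the agent's clock:

# assumption: use local time for path escapes when no timestamp header exists
agent.sinks.hdfsSinkPc.hdfs.useLocalTimeStamp = true
agent.sinks.hdfsSinkApp.hdfs.useLocalTimeStamp = true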

# File name prefix and suffix
agent.sinks.hdfsSinkPc.hdfs.filePrefix=pc_action_log
agent.sinks.hdfsSinkPc.hdfs.fileSuffix=.json
agent.sinks.hdfsSinkApp.hdfs.filePrefix=app_action_log
agent.sinks.hdfsSinkApp.hdfs.fileSuffix=.json

# Mark files that are still being written with an in-use prefix/suffix,
# so downstream jobs skip them before they are closed
#agent.sinks.hdfsSink.hdfs.inUsePrefix=_
#agent.sinks.hdfsSink.hdfs.inUseSuffix=

# Custom interceptor (example, commented out)
#agent.sources.kafkaSource.interceptors=i1
#agent.sources.kafkaSource.interceptors.i1.type=com.hadoop.flume.FormatInterceptor$Builder
#agent.channels = memoryChannel

# File channel directories: events are persisted to local disk
# Optional: dual checkpoints keep a backup copy of the checkpoint
#agent.channels.FilePc.useDualCheckpoints=true
#agent.channels.FileApp.useDualCheckpoints=true
#agent.channels.FilePc.backupCheckpointDir=/opt/apache-flume-1.6.0-bin/checkpoint2
agent.channels.FilePc.checkpointDir=/opt/apache-flume-1.6.0-bin/checkpoint_Pc
agent.channels.FileApp.checkpointDir=/opt/apache-flume-1.6.0-bin/checkpoint_App
agent.channels.FilePc.dataDirs=/opt/apache-flume-1.6.0-bin/dataDir_Pc
agent.channels.FileApp.dataDirs=/opt/apache-flume-1.6.0-bin/dataDir_App
# Maximum size (bytes) of a single data file; default is ~2 GB
agent.channels.FilePc.maxFileSize=2146435071
agent.channels.FileApp.maxFileSize=2146435071

3. Starting the Flume agent

# Start the agent with HTTP metrics reporting enabled
/opt/apache-flume-1.6.0-bin/bin/flume-ng agent -c conf -f conf/flume-conf.properties -n agent -Dflume.monitoring.type=http -Dflume.monitoring.port=5653 -Dflume.root.logger=INFO,console  >>flume_to_hdfs.log 2>&1 &
The metrics are then served at http://<ip>:5653/metrics.
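A quick check from the shell (the host name is hypothetical; python -m json.tool just pretty-prints the JSON):

curl -s http://flume-host:5653/metrics | python -m json.tool

The response looks like this: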

{
    "SOURCE.app": {
        "KafkaCommitTimer": "528",
        "KafkaEventGetTimer": "161840",
        "EventReceivedCount": "8272",  //source端成功收到的event数量
        "AppendBatchAcceptedCount": "0", //追加到channel中的批数量
        "Type": "SOURCE",
        "EventAcceptedCount": "8272", //成功放入channel的event数量
        "AppendReceivedCount": "0",//source追加目前收到的数量
        "StartTime": "1511322862896",
        "AppendAcceptedCount": "0", //放入channel的event数量
        "OpenConnectionCount": "0",//打开的连接数
        "AppendBatchReceivedCount": "0",//source端刚刚追加的批数量
        "StopTime": "0" //组件停止时间
    },
    "SOURCE.pc": {
        "KafkaCommitTimer": "2117",
        "KafkaEventGetTimer": "536256",
        "EventReceivedCount": "115891",
        "AppendBatchAcceptedCount": "0",
        "Type": "SOURCE",
        "EventAcceptedCount": "115891",
        "AppendReceivedCount": "0",
        "StartTime": "1511322862896",
        "AppendAcceptedCount": "0",
        "OpenConnectionCount": "0",
        "AppendBatchReceivedCount": "0",
        "StopTime": "0"
    },
    "CHANNEL.FilePc": {
        "ChannelCapacity": "10000", //通道容量
        "ChannelFillPercentage": "1.91", //通道使用比例
        "Type": "CHANNEL",
        "ChannelSize": "191", //目前在channel中的event数量
        "EventTakeSuccessCount": "115700", //从channel中成功取走的event数量
        "EventTakeAttemptCount": "116707",//尝试从channel中取走event的次数
        "StartTime": "1511322861765",
        "EventPutSuccessCount": "115891",//成功放入channel的event数量
        "EventPutAttemptCount": "115891",//尝试放入将event放入channel的次数
        "StopTime": "0"
    },
    "SINK.hdfsSinkPc": {
        "ConnectionCreatedCount": "10",//创建连接数
        "ConnectionClosedCount": "9",//关闭连接数量
        "Type": "SINK",
        "BatchCompleteCount": "830",//完成的批数量
        "BatchEmptyCount": "523",//批量取空的数量
        "EventDrainAttemptCount": "115700",//尝试提交的event数量
        "StartTime": "1511322861782",
        "EventDrainSuccessCount": "115700",//成功发送event的数量
        "BatchUnderflowCount": "484",//正处于批量处理的batch数
        "StopTime": "0",
        "ConnectionFailedCount": "0" //连接失败数
    },
    "SINK.hdfsSinkApp": {
        "ConnectionCreatedCount": "10",
        "ConnectionClosedCount": "9",
        "Type": "SINK",
        "BatchCompleteCount": "13",
        "BatchEmptyCount": "288",
        "EventDrainAttemptCount": "8389",
        "StartTime": "1511322861782",
        "EventDrainSuccessCount": "8389",
        "BatchUnderflowCount": "112",
        "StopTime": "0",
        "ConnectionFailedCount": "0"
    },
    "CHANNEL.FileApp": {
        "ChannelCapacity": "10000",
        "ChannelFillPercentage": "0.0",
        "Type": "CHANNEL",
        "ChannelSize": "0",
        "EventTakeSuccessCount": "8389",
        "EventTakeAttemptCount": "8789",
        "StartTime": "1511322861494",
        "EventPutSuccessCount": "8272",
        "EventPutAttemptCount": "8272",
        "StopTime": "0"
    }
}
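These counters are handy for alerting: a sink that cannot keep up shows as a rising ChannelFillPercentage. A minimal sketch for pulling one value out (host name hypothetical, assuming jq is installed):

# watch how full the PC file channel is
curl -s http://flume-host:5653/metrics | jq '."CHANNEL.FilePc".ChannelFillPercentage'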

Ganglia reporting

        -Dflume.monitoring.type=ganglia  # by default, Flume reports metrics in Ganglia 3.1 format
        -Dflume.monitoring.pollFrequency=45 # reporting interval, in seconds
        -Dflume.monitoring.isGanglia3=true # report in Ganglia 3 format instead
        -Dflume.root.logger=INFO,console
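Put together, a Ganglia-enabled launch might look like the sketch below; the gmond host:port is hypothetical, and -Dflume.monitoring.hosts tells Flume where to send the metrics:

/opt/apache-flume-1.6.0-bin/bin/flume-ng agent -c conf -f conf/flume-conf.properties -n agent \
    -Dflume.monitoring.type=ganglia \
    -Dflume.monitoring.hosts=gmond-host:8649 \
    -Dflume.monitoring.pollFrequency=45 \
    -Dflume.root.logger=INFO,console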

4. Flume event data structure

  1. An event is Flume's basic unit of data. It consists of zero or more headers plus a body. The headers form a map of strings; the body is a byte array. The body carries the actual payload being transported; data placed in headers is not written out by the sink.
  2. Headers are key/value pairs that can drive routing decisions or carry other structured information, such as the event's timestamp or the hostname of the originating server. Think of them as serving the same purpose as HTTP headers: carrying extra information alongside the body (see the sample rendering after this list).
  3. Flume lets you modify events by adding headers, and thereby route logs based on message content. The mechanisms for this are interceptors and selectors.
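As an illustration (values are hypothetical), the logger sink renders an event as its headers map followed by the body as hex plus a printable preview:

Event: { headers:{timestamp=1511409753690, host=hadoop2} body: 68 65 6C 6C 6F hello }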

5. Flume's built-in interceptors include the following types

Timestamp Interceptor;
Host Interceptor;
Static Interceptor;
UUID Interceptor;
Morphline Interceptor;
Search and Replace Interceptor;
Regex Filtering Interceptor;
Regex Extractor Interceptor;

(1) Host Interceptor
Add the agent host to the event headers:

a1.sources = r1  
a1.sinks = k1  
a1.channels = c1  

# Describe/configure the source  
a1.sources.r1.type = syslogtcp  
a1.sources.r1.port = 50000  
a1.sources.r1.host = 192.168.10.2 
a1.sources.r1.channels = c1  

a1.sources.r1.interceptors = i1 
a1.sources.r1.interceptors.i1.type = host
a1.sources.r1.interceptors.i1.useIP = false
a1.sources.r1.interceptors.i1.hostHeader = agentHost

# Describe the sink  
a1.sinks.k1.type = logger   
a1.sinks.k1.channel = c1  


# Use a channel which buffers events in memory
a1.channels.c1.type = memory  
a1.channels.c1.capacity = 1000  
a1.channels.c1.transactionCapacity = 100  


Launch example:
../bin/flume-ng agent -c ../conf -f ../conf/flume-conf-logger.properties -n a1 -Dflume.root.logger=INFO,console
Send some test data:
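One way to push a test line at the syslog source is netcat (assuming nc is available; the plain line is not valid syslog, so the source flags it but still wraps it in an event):

echo "fffff  Intercept" | nc 192.168.10.2 50000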
Event created from Invalid Syslog data.
2017-11-23 11:47:58,476 INFO  [SinkRunner-PollingRunner-DefaultSinkProcessor] sink.LoggerSink (LoggerSink.java:process(94)) - Event: { headers:{Severity=0, Facility=0, agentHost=hadoop2, flume.syslog.status=Invalid} body: 66 66 66 66 66 20 20 49 6E 74 65 72 63 65 70 74 fffff  Intercept }

The agentHost=hadoop2 entry has been added to the headers. Because this information rides along with the event, a selector can use it later when routing the event into channels.

# The timestamp interceptor is configured the same way
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i1.preserveExisting = false
# which produces output like:
Event created from Invalid Syslog data.
2017-11-23 12:02:33,694 INFO  [SinkRunner-PollingRunner-DefaultSinkProcessor] sink.LoggerSink (LoggerSink.java:process(94)) - Event: { headers:{Severity=0, Facility=0, flume.syslog.status=Invalid, timestamp=1511409753690} body: 66 66 66 66 66 20 20 49 6E 74 65 72 63 65 70 74 fffff  Intercept }

Static Interceptor

# A static interceptor adds one fixed header; several can be chained (see the two-interceptor example further below)
a1.sources.r1.interceptors = i1 
a1.sources.r1.interceptors.i1.type = static  
a1.sources.r1.interceptors.i1.key = key1  
a1.sources.r1.interceptors.i1.value = value1

# Resulting output
 Syslog TCP Source starting...
2017-11-23 12:08:27,975 WARN  [New I/O  worker #1] source.SyslogUtils (SyslogUtils.java:buildEvent(316)) - Event created from Invalid Syslog data.
2017-11-23 12:08:27,980 INFO  [SinkRunner-PollingRunner-DefaultSinkProcessor] sink.LoggerSink (LoggerSink.java:process(94)) - Event: { headers:{key1=value1, Severity=0, Facility=0, flume.syslog.status=Invalid} body: 66 66 66 66 66 20 20 49 6E 74 65 72 63 65 70 74 fffff  Intercept }
A multiplexing selector fans one source out over several channels, each drained by its own sink:

                       +--> channel c1 --> sink k1
  source --(selector)--+
                       +--> channel c2 --> sink k2

a1.sources = r1  
a1.sinks = k1 k2
a1.channels = c1 c2 

# Describe/configure the source  

a1.sources.r1.type = syslogtcp  
a1.sources.r1.port = 50000  
a1.sources.r1.host = 192.168.10.2  
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.channels = c1 c2
a1.sources.r1.selector.header = key1
a1.sources.r1.selector.mapping.value1 = c1
# if header key1 equals value1, use channel c1
a1.sources.r1.selector.mapping.value2 = c2
# if header key1 equals value2, use channel c2
a1.sources.r1.selector.default = c1
# fall back to c1 when no mapping matches

a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = static  
a1.sources.r1.interceptors.i1.key = key1  
a1.sources.r1.interceptors.i1.value = value1
a1.sources.r1.interceptors.i2.type = static  
a1.sources.r1.interceptors.i2.key = key2  
a1.sources.r1.interceptors.i2.value = value2   
# Describe the sink  
a1.sinks.k1.type = logger   
a1.sinks.k2.type = logger   
a1.sinks.k1.channel = c1  
a1.sinks.k2.channel = c2
# Use a channel which buffers events in memory
a1.channels.c1.type = memory  
a1.channels.c1.capacity = 1000  
a1.channels.c1.transactionCapacity = 100  
a1.channels.c2.type = memory  
a1.channels.c2.capacity = 1000  
a1.channels.c2.transactionCapacity = 100  
# Resulting headers:
Syslog TCP Source starting...
2017-11-23 14:54:31,743 WARN  [New I/O  worker #1] source.SyslogUtils (SyslogUtils.java:buildEvent(316)) - Event created from Invalid Syslog data.
2017-11-23 14:54:33,137 INFO  [SinkRunner-PollingRunner-DefaultSinkProcessor] sink.LoggerSink (LoggerSink.java:process(94)) - Event: { headers:{key1=value1, key2=value2, Severity=0, Facility=0, flume.syslog.status=Invalid} body: 66 66 66 66 66 20 20 49 6E 74 65 72 63 65 70 74 fffff  Intercept }

# Config file: timestamp_case16.conf
# Name the components on this agent  
a1.sources = r1  
a1.sinks = k1 k2
a1.channels = c1 c2 

# Describe/configure the source  

a1.sources.r1.type = syslogtcp  
a1.sources.r1.port = 50000  
a1.sources.r1.host = 10.15.184.184 
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.channels = c1 c2
a1.sources.r1.selector.header = key1
a1.sources.r1.selector.mapping.value1 = c1
# if header key1 equals value1, use channel c1
a1.sources.r1.selector.mapping.value2 = c2
# if header key1 equals value2, use channel c2
a1.sources.r1.selector.default = c1
# fall back to c1 when no mapping matches

a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = static  
a1.sources.r1.interceptors.i1.key = key1  
a1.sources.r1.interceptors.i1.value = value1
a1.sources.r1.interceptors.i2.type = static  
a1.sources.r1.interceptors.i2.key = key2  
a1.sources.r1.interceptors.i2.value = value2
# Describe the sink  
a1.sinks.k1.type = file_roll
a1.sinks.k1.channel = c1
a1.sinks.k1.sink.directory = /tmp/log/flume1
a1.sinks.k2.type = file_roll
a1.sinks.k2.channel = c2
a1.sinks.k2.sink.directory = /tmp/log/flume2

# Use a channel which buffers events in memory
a1.channels.c1.type = memory  
a1.channels.c1.capacity = 1000  
a1.channels.c1.transactionCapacity = 100  
a1.channels.c2.type = memory  
a1.channels.c2.capacity = 1000  
a1.channels.c2.transactionCapacity = 100

# All results end up in flume1: the static interceptor stamps key1=value1 on
# every event, and the selector maps value1 to c1, so sink k2 (flume2) never
# receives anything
# cat /tmp/log/flume1/1511420933553-1
fffff  Interceptor
fffff  Interceptor
fffff  Interceptor
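To see events land in /tmp/log/flume2 instead, the selector header has to carry value2. A minimal tweak against this same config (a hypothetical change, for illustration only):

# stamp key1=value2 so the selector routes events to c2 and sink k2 (flume2)
a1.sources.r1.interceptors.i1.value = value2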