Flume拦截器
当Source读取events发送到Sink的时候,在events header中加入一些有用的信息,或者对events的内容进行过滤,完成初步的数据清洗。
Flume自带多种拦截器,本文介绍其中常用的六种,分别为时间拦截器、主机拦截器、UUID拦截器、查询拦截器(查找并替换)、正则过滤拦截器、正则抽取拦截器。
时间拦截器
# agent components
a1.sources=r1
a1.sinks=k1
a1.channels=c1
# define sources
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/usr/local/chinatelecom
# define interceptors
# the timestamp interceptor adds a "timestamp" header (processing time in millis) to every event;
# the HDFS sink below needs that header to resolve %H in hdfs.path
a1.sources.r1.interceptors=destiny
# class name is TimestampInterceptor (lower-case "s"); the alias "timestamp" works as well
a1.sources.r1.interceptors.destiny.type=org.apache.flume.interceptor.TimestampInterceptor$Builder
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# define sinks
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://hadoop1:9000/flume-interceptor/%H
a1.sinks.k1.hdfs.filePrefix=event-
a1.sinks.k1.hdfs.fileType=DataStream
# roll files by size (134217728 bytes) or every 60 seconds; rollCount=0 disables rolling by event count
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.rollSize=134217728
a1.sinks.k1.hdfs.rollInterval=60
# bind the source and the sink to the channel
a1.sinks.k1.channel=c1
a1.sources.r1.channels=c1
主机拦截器
# agent components
a1.sources=r1
a1.channels=c1
a1.sinks=k1
# define sources
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /opt/Destiny
# define interceptors
a1.sources.r1.interceptors=destiny
a1.sources.r1.interceptors.destiny.type=host
# options must be set on the interceptor declared above (destiny), otherwise they are ignored
# useIP: false -> hostname, true -> ip address
a1.sources.r1.interceptors.destiny.useIP=false
# name of the header that receives the host value; referenced below as %{agentHost}
a1.sources.r1.interceptors.destiny.hostHeader=agentHost
# define sinks
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://hadoop1:9000/flumehost/%H
a1.sinks.k1.hdfs.filePrefix=Andy_%{agentHost}
a1.sinks.k1.hdfs.fileSuffix=.log
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.writeFormat=Text
a1.sinks.k1.hdfs.rollInterval=10
# no timestamp interceptor here, so %H is resolved from the sink's local time
a1.sinks.k1.hdfs.useLocalTimeStamp=true
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
UUID拦截器
# agent components
a1.sources=r1
a1.sinks=k1
a1.channels=c1
# define sources
a1.sources.r1.type=exec
# NOTE(review): this path uses a capital "Local" while the other examples use /usr/local — confirm it is intended
a1.sources.r1.command=tail -F /usr/Local/UUID
# define interceptors
a1.sources.r1.interceptors=destiny
a1.sources.r1.interceptors.destiny.type=org.apache.flume.sink.solr.morphline.UUIDInterceptor$Builder
# options must be set on the interceptor declared above (destiny), otherwise they are ignored
# keep an already present id header instead of overwriting it
a1.sources.r1.interceptors.destiny.preserveExisting=true
# constant string put in front of every generated UUID
a1.sources.r1.interceptors.destiny.prefix=UUID_
# define sinks
a1.sinks.k1.type=logger
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
查询拦截器
# agent components
a1.sources=r1
a1.channels=c1
a1.sinks=k1
# define sources
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /usr/local/select
# define interceptors
a1.sources.r1.interceptors=destiny
a1.sources.r1.interceptors.destiny.type=search_replace
# the file is read as Java properties, where a single backslash is an escape character and
# "\d" would be loaded as "d"; the backslash has to be doubled to reach the regex engine
a1.sources.r1.interceptors.destiny.searchPattern=\\d+
# every match of searchPattern in the event body is replaced with this string
a1.sources.r1.interceptors.destiny.replaceString=destiny
a1.sources.r1.interceptors.destiny.charset=UTF-8
# define sinks
a1.sinks.k1.type=logger
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
正则过滤拦截器
# agent components
a1.sources=r1
a1.channels=c1
a1.sinks=k1
# define sources
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /usr/local/select
# define interceptors
a1.sources.r1.interceptors=destiny
a1.sources.r1.interceptors.destiny.type=regex_filter
a1.sources.r1.interceptors.destiny.regex=^A.*
# true  -> events starting with "A" are dropped
# false -> events NOT starting with "A" are dropped
# (the property name is excludeEvents, plural)
a1.sources.r1.interceptors.destiny.excludeEvents=true
# define sinks
a1.sinks.k1.type=logger
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
正则抽取拦截器
# agent components
a1.sources=r1
a1.channels=c1
a1.sinks=k1
# define sources
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /usr/local/select
# define interceptors
a1.sources.r1.interceptors=destiny
a1.sources.r1.interceptors.destiny.type=regex_extractor
# capture group 1 is written to the header named by s1, capture group 2 to the header named by s2
# NOTE(review): there is no space between "ip is" and the second group, so a leading space in the
# input ends up in the captured value — confirm against the real log format
a1.sources.r1.interceptors.destiny.regex=hostname is (.*?) ip is(.*)
a1.sources.r1.interceptors.destiny.serializers=s1 s2
a1.sources.r1.interceptors.destiny.serializers.s1.name=cookieid
# key must be spelled "serializers", otherwise s2 gets no header name
a1.sources.r1.interceptors.destiny.serializers.s2.name=ip
# define sinks
a1.sinks.k1.type=logger
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
自定义拦截器
<!-- https://mvnrepository.com/artifact/org.apache.flume/flume-ng-core -->
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.8.0</version>
</dependency>
package com.maven.flume;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.ArrayList;
import java.util.List;
/**
* @author Administrator
*/
public class FlumeInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
byte[] body = event.getBody();
event.setBody(new StringBuilder().append(new String(body)).reverse().toString().getBytes());
return null;
}
@Override
public List<Event> intercept(List<Event> eventList) {
ArrayList<Event> list = new ArrayList<>();
for (Event event: eventList){
list.add(intercept(event));
}
return list;
}
@Override
public void close() {
}
private static class FlumeBuilder implements Builder{
@Override
public Interceptor build() {
return new FlumeInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
# agent
a1.sources=r1
a1.sinks=k1
a1.channels=c1
# define sources
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F -c +0 /usr/local/destiny.csv
a1.sources.r1.shell=/bin/bash -c
# define interceptors
a1.sources.r1.interceptors=destiny
# fully qualified name of the custom interceptor's nested Builder class
a1.sources.r1.interceptors.destiny.type=com.maven.flume.FlumeInterceptor$Builder
# define sinks
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path=hdfs://hadoop1:9000/destiny/%H
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.filePrefix=Destiny-
# round the event timestamp down to the hour; roundValue/roundUnit only take effect with round=true
a1.sinks.k1.hdfs.round=true
a1.sinks.k1.hdfs.roundValue=1
a1.sinks.k1.hdfs.roundUnit=hour
# roll files by size (134217728 bytes) or every 60 seconds; rollCount=0 disables rolling by event count
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.rollSize=134217728
a1.sinks.k1.hdfs.rollInterval=60
# no timestamp interceptor here, so %H is resolved from the sink's local time
a1.sinks.k1.hdfs.useLocalTimeStamp=true
# define channels
a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100
# bind the source and the sink to the channel (the sink takes the channel name c1, not its own name)
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
Flume启动命令
# long options take two dashes; the logger setting is "<level>,<appender>" separated by a comma
flume-ng agent -n a1 --conf ./conf --conf-file ./conf/xx.conf -Dflume.root.logger=INFO,console