拦截器是简单的插件式组件,设置在source和channel之间。source接收到的事件,在写入channel之前,拦截器都可以进行转换或者删除这些事件。每个拦截器只处理同一个source接收到的事件。flume官方实现了很多拦截器,也可以自定义拦截器。通过实现自定义的拦截器可以对日志进行ETL。
自定义拦截器只需要实现Interceptor接口(以及配套的Interceptor.Builder)。例子代码如下(经过测试检验)。
Java文件
1. OfflineDataFlumeInterceptor.java
package com.mycompany;

import com.alibaba.fastjson.JSONObject;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;

import static com.mycompany.Constants.*;
/**
* 自定义拦截器,实现Interceptor接口,并且实现其抽象方法
*/
/**
 * Custom flume interceptor (implements the Interceptor interface).
 *
 * For every event it extracts routing information from the event body —
 * client id, data day, data hour and order id — and stamps it into the
 * event headers so a downstream HDFS sink can use
 * %{clientid}/%{orderid}/%{dataday}/%{datahour} in its path.
 * Events that cannot be parsed are dropped (intercept returns null).
 */
public class OfflineDataFlumeInterceptor implements Interceptor {

    private static final Logger logger =
            LoggerFactory.getLogger(OfflineDataFlumeInterceptor.class);

    /**
     * Raw data format, received from the flume interceptor configuration
     * parameter "DataFormat" (see Builder.configure()). Static because the
     * Builder is configured before interceptor instances handle events.
     * May be null when the parameter is missing from flume.conf.
     */
    private static String DataFormat = "";

    // All timestamps in the payloads are UTC.
    private static final ZoneId UTC = ZoneId.of("UTC");

    // DateTimeFormatter is immutable and thread-safe (unlike SimpleDateFormat,
    // which the previous version re-created per event), so these are shared
    // static constants.
    private static final DateTimeFormatter DAY_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(UTC);
    private static final DateTimeFormatter HOUR_FORMAT =
            DateTimeFormatter.ofPattern("HH").withZone(UTC);
    // Matches the EXJson "timestamp" field, e.g. "2019-07-22 18:05:16.881" (UTC).
    private static final DateTimeFormatter EX_TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS");

    /** Stamps the "dataday"/"datahour" headers derived from the given UTC instant. */
    private static void putTimeHeaders(Map<String, String> headers, Instant when) {
        headers.put("dataday", DAY_FORMAT.format(when));
        headers.put("datahour", HOUR_FORMAT.format(when));
    }

    /**
     * Parses a JSON message whose "payload" field is a base64-encoded binary
     * record, and stamps the routing headers.
     *
     * Decoded payload layout (2 bytes length + 6 bytes time + 2 bytes id):
     *   bytes 0-1  length of the decoded payload
     *   bytes 2-7  collection timestamp in milliseconds, BIG_ENDIAN
     *   bytes 8-9  device orderID, numeric, BIG_ENDIAN
     *
     * @param event the flume event whose headers are updated in place
     * @return true when all headers were stamped; false to drop the event
     */
    private boolean processBinMsg(Event event) {
        Map<String, String> headers = event.getHeaders();
        String rawmsg = new String(event.getBody(), StandardCharsets.UTF_8);

        JSONObject jsonobj;
        try {
            jsonobj = JSONObject.parseObject(rawmsg);
        } catch (RuntimeException e) {
            // FIX: fastjson throws unchecked JSONException on malformed input;
            // previously this propagated out of intercept() and could abort
            // the whole batch instead of dropping one bad event.
            logger.warn("processBinMsg(), can't convert into json object:{}", rawmsg);
            return false;
        }
        if (jsonobj == null) {
            logger.warn("processBinMsg(), invalid data format, can't convert into json object ");
            return false;
        }

        String clientid = jsonobj.getString("clientid");
        if (clientid == null) {
            // FIX: a null clientid used to be stored as a null header value,
            // producing a broken %{clientid} HDFS path downstream.
            logger.warn("processBinMsg(), can't get clientid");
            return false;
        }
        headers.put("clientid", clientid);

        String encodedPayload = jsonobj.getString("payload");
        if (encodedPayload == null) {
            logger.warn("processBinMsg(), can't get base64 payload");
            return false;
        }

        byte[] payload;
        try {
            payload = Base64.getDecoder().decode(encodedPayload);
        } catch (IllegalArgumentException e) {
            // FIX: invalid base64 used to throw out of the interceptor.
            logger.warn("processBinMsg(), invalid base64 payload");
            return false;
        }
        if (payload.length < 10) {
            logger.warn("processBinMsg(), invalid payload data");
            return false;
        }

        // Collection time: 6 big-endian bytes of epoch milliseconds.
        long msseconds = 0L;
        for (int i = 2; i < 8; i++) {
            msseconds = (msseconds << 8) | (payload[i] & 0xFF);
        }
        putTimeHeaders(headers, Instant.ofEpochMilli(msseconds));

        // orderID: 2 bytes; ByteBuffer defaults to BIG_ENDIAN.
        short orderID = ByteBuffer.wrap(payload, 8, 2).getShort();
        headers.put("orderid", Short.toString(orderID));
        return true;
    }

    /**
     * Parses an "extended JSON" message: clientid comes from "_gatewayId",
     * the order id from "_order", and the collection time from
     * data.timestamp ("yyyy-MM-dd HH:mm:ss.SSS", UTC).
     *
     * @param event the flume event whose headers are updated in place
     * @return true when all headers were stamped; false to drop the event
     */
    private boolean processEXJsonMsg(Event event) {
        Map<String, String> headers = event.getHeaders();
        String rawmsg = new String(event.getBody(), StandardCharsets.UTF_8);

        JSONObject jsonobj;
        try {
            jsonobj = JSONObject.parseObject(rawmsg);
        } catch (RuntimeException e) {
            logger.warn("processEXJsonMsg(), can't convert into json object:{}", rawmsg);
            return false;
        }
        if (jsonobj == null) {
            logger.warn("processEXJsonMsg(), invalid data format, can't convert into json object:{}", rawmsg);
            return false;
        }

        String clientid = jsonobj.getString("_gatewayId");
        if (clientid == null) {
            // FIX: a missing gateway id used to be stored as a null header
            // value instead of dropping the event.
            logger.warn("processEXJsonMsg(), can't get _gatewayId");
            return false;
        }
        headers.put("clientid", clientid);

        JSONObject payloadobj = jsonobj.getJSONObject("data");
        if (payloadobj == null) {
            logger.warn("processEXJsonMsg(), can't get payload");
            return false;
        }

        // "timestamp": "2019-07-22 18:05:16.881" — collection time, UTC.
        String mytime = payloadobj.getString("timestamp");
        if (mytime == null) {
            logger.warn("No field timestamp value in data section!");
            return false;
        }
        Instant datatime;
        try {
            datatime = LocalDateTime.parse(mytime, EX_TIMESTAMP_FORMAT)
                    .atZone(UTC).toInstant();
        } catch (DateTimeParseException e) {
            logger.info("processEXJsonMsg(), exceptions happend:{}", e.getMessage());
            return false;
        }
        putTimeHeaders(headers, datatime);

        headers.put("orderid", String.valueOf(jsonobj.getIntValue("_order")));
        return true;
    }

    /**
     * Plain JSON message format — placeholder, accepts the event unchanged.
     */
    private boolean processJsonMsg(Event event) {
        logger.info("not implemented yet");
        return true;
    }

    /**
     * Interceptor constructor, invoked by the static Builder.build() below.
     */
    public OfflineDataFlumeInterceptor() {
    }

    /**
     * Initializes the interceptor; runs after the constructor, i.e. once the
     * interceptor object has been created.
     */
    @Override
    public void initialize() {
        logger.info("--- OfflineDataFlumeInterceptor: initialize here ---");
    }

    /**
     * Processes one event. Not called by flume directly — invoked from
     * intercept(List) below.
     *
     * @param event the incoming event
     * @return the event with routing headers added, or null to drop it
     */
    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        // NOTE(review): events arriving with no headers at all are dropped.
        // The Kafka source always sets topic/partition headers, so this only
        // rejects malformed events — confirm if other sources are used.
        if (headers.isEmpty()) {
            logger.warn("intercept() headers is empty!");
            return null;
        }
        if (event.getBody().length == 0) {
            logger.warn("intercept() data body empty!");
            return null;
        }
        // DataFormat is null when the flume.conf parameter is missing.
        if (DataFormat == null) {
            logger.warn("file dataformat is NOT set!");
            return null;
        }
        switch (DataFormat) {
            case JSONFORMAT:
                return processJsonMsg(event) ? event : null;
            case BINFORMAT:
                return processBinMsg(event) ? event : null;
            case EXJSONFORMAT:
                return processEXJsonMsg(event) ? event : null;
            default:
                logger.error("Unsupported data format:{}", DataFormat);
                return null;
        }
    }

    /**
     * Processes a batch of events; the batch size is bounded by the channel's
     * transactionCapacity in the flume configuration. Delegates each event to
     * intercept(Event) and keeps only the events it accepts.
     *
     * @param events the incoming batch
     * @return the surviving events, in order
     */
    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> results = new ArrayList<>(events.size());
        for (Event e : events) {
            Event intercepted = intercept(e);
            if (intercepted != null) {
                results.add(intercepted);
            }
        }
        return results;
    }

    /**
     * Tears down the interceptor; release any held resources here.
     */
    @Override
    public void close() {
        logger.info("- OfflineDataFlumeInterceptor close() here --");
    }

    /**
     * Static builder used by flume to create the interceptor; implements
     * Interceptor.Builder. Referenced in flume.conf as
     * com.mycompany.OfflineDataFlumeInterceptor$Builder.
     */
    public static class Builder implements Interceptor.Builder {
        /**
         * @return a new interceptor instance
         */
        @Override
        public Interceptor build() {
            return new OfflineDataFlumeInterceptor();
        }

        /**
         * Receives the interceptor parameters from the flume configuration.
         *
         * @param context gives access to the interceptor parameters; the key
         *                must match the name used in flume.conf ("DataFormat")
         */
        @Override
        public void configure(Context context) {
            DataFormat = context.getString("DataFormat");
            logger.info("configure(): DataFormat:{}", DataFormat);
        }
    }
}
2. Constants.java
package com.mycompany;
/**
 * Shared constants for the offline flume interceptor: the supported values
 * of the "DataFormat" interceptor configuration parameter.
 */
public final class Constants {

    /** Plain JSON messages (handler not implemented yet). */
    public static final String JSONFORMAT = "aiotjson";

    /** Extended JSON messages carrying a "data" section with a textual timestamp. */
    public static final String EXJSONFORMAT = "aiotexjson";

    /** JSON messages whose "payload" field is a base64-encoded binary record. */
    public static final String BINFORMAT = "aiotbinjson";

    private Constants() {
        // constant holder — not instantiable
    }
}
3. pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mycompany.aiot</groupId>
<artifactId>offlineFlume</artifactId>
<version>1.0</version>
<packaging>jar</packaging>
<name>AIOT offline Flume interceptor</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<maven-plugins.version>3.1.1</maven-plugins.version>
<maven-jar.version>3.1.0</maven-jar.version>
<maven-compiler.version>3.7.0</maven-compiler.version>
<maven-resource.version>3.0.1</maven-resource.version>
<maven-source.version>3.0.1</maven-source.version>
<maven-dependency.version>3.1.1</maven-dependency.version>
<maven-assembly.version>3.1.0</maven-assembly.version>
<logback.version>1.2.3</logback.version>
<slf4j.version>1.7.25</slf4j.version>
<es.version>6.3.2</es.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-sdk</artifactId>
<version>1.8.0</version>
</dependency>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.8.0</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- json -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.60</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.9.9</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/java</sourceDirectory>
<finalName>offlineFlume</finalName>
<defaultGoal>package</defaultGoal>
<plugins>
<!-- 用于编译的plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler.version}</version>
<configuration>
<fork>true</fork>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.mycompany.OfflineDataFlumeInterceptor</mainClass><!--main函数所在的类-->
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.handlers</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.schemas</resource>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
部署方法:
1. 把上面的java代码编译成offlineFlume.jar(对应pom.xml中的finalName)。
2. flume安装完成后,把上面产生的jar包offlineFlume.jar 拷贝到 flume的安装目录下的 plugins.d/flume-plugins/lib 下面。
3. 修改 flume的安装目录下的 conf/flume.conf,关键点如下
# Sources, channels, and sinks are defined per
# agent name, in this case 'myagent1'.
myagent1.sources = ds1
myagent1.channels = channel1
myagent1.sinks = hdfs1
# For each source, channel, and sink, set
# standard properties.
#my interceptor
myagent1.sources.ds1.interceptors = i1
myagent1.sources.ds1.interceptors.i1.type = com.mycompany.OfflineDataFlumeInterceptor$Builder
myagent1.sources.ds1.interceptors.i1.DataFormat = aiotbinjson
myagent1.sources.ds1.type = org.apache.flume.source.kafka.KafkaSource
myagent1.sources.ds1.channels = channel1
myagent1.sources.ds1.kafka.topics = RTDPBSF
myagent1.sources.ds1.kafka.bootstrap.servers = 10.251.7.137:9092,10.251.7.138:9092,10.251.7.139:9092
myagent1.sources.ds1.kafka.consumer.auto.offset.reset = earliest
myagent1.sources.ds1.kafka.consumer.group.id = offlineFlumeGroup2
myagent1.sources.ds1.batchSize = 500
myagent1.sources.ds1.batchDurationMillis = 2000
myagent1.channels.channel1.type = memory
myagent1.channels.channel1.capacity = 100000
myagent1.channels.channel1.transactionCapacity = 50000
myagent1.channels.channel1.keep-alive = 60
myagent1.sinks.hdfs1.type = hdfs
myagent1.sinks.hdfs1.channel = channel1
myagent1.sinks.hdfs1.hdfs.path = /druiddata/binjson/%{clientid}_%{orderid}/%{dataday}_%{datahour}/
myagent1.sinks.hdfs1.hdfs.filePrefix = offline_binjson
#myagent1.sinks.hdfs1.hdfs.fileSuffix = .data
myagent1.sinks.hdfs1.hdfs.round=false
#控制文件的滚动频率
#event数量维度
myagent1.sinks.hdfs1.hdfs.rollInterval = 0
myagent1.sinks.hdfs1.hdfs.rollSize = 0
myagent1.sinks.hdfs1.hdfs.rollCount = 100000
myagent1.sinks.hdfs1.hdfs.batchSize = 5000
myagent1.sinks.hdfs1.hdfs.idleTimeout = 10
myagent1.sinks.hdfs1.hdfs.threadsPoolSize = 10
myagent1.sinks.hdfs1.hdfs.useLocalTimeStamp = true
#生成的文件类型,默认是Sequencefile,可用DataStream则为普通文本
#myagent1.sinks.hdfs1.hdfs.fileType = DataStream
#myagent1.sinks.hdfs1.hdfs.writeFormat = Text
myagent1.sinks.hdfs1.hdfs.fileType = CompressedStream
#myagent1.sinks.hdfs1.hdfs.codeC = gzip
#bzip2 支持切分压缩
myagent1.sinks.hdfs1.hdfs.codeC = bzip2
注意下面的配置
#my interceptor
myagent1.sources.ds1.interceptors = i1
myagent1.sources.ds1.interceptors.i1.type = com.mycompany.OfflineDataFlumeInterceptor$Builder
myagent1.sources.ds1.interceptors.i1.DataFormat = aiotbinjson
官方文档:http://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html