Flume定制拦截器 Interceptor 实战

拦截器是简单的插件式组件,设置在source和channel之间。source接收到的事件,在写入channel之前,拦截器都可以对其进行转换或者删除。每个拦截器只处理同一个source接收到的事件。flume官方实现了很多拦截器,也可以自定义拦截器。通过实现自定义的拦截器可以对日志进行ETL。
自定义拦截器只需要实现Interceptor接口及其内部的Builder接口。例子代码如下(经过测试检验过的)。

Java文件
1. OfflineDataFlumeInterceptor.java

package com.mycompany;

import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;

import static com.mycompany.Constants.*;

/**
 * 自定义拦截器,实现Interceptor接口,并且实现其抽象方法
 */
public class OfflineDataFlumeInterceptor implements Interceptor {
    private static final Logger logger = LoggerFactory.getLogger(OfflineDataFlumeInterceptor.class);
    //自定义拦截器参数,用来接收自定义拦截器flume配置参数
    private static String DataFormat = "";

    private boolean procesBinMsg(Event event) {
        //logger.info("enter process Bin message");
        Map<String, String> headers = event.getHeaders();
        String rawmsg = null;
        try {
            rawmsg = new String(event.getBody(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.info("procesBinMsg(), exceptions happend:{}", e.getMessage());
            //e.getStackTrace();
            return false;
        }

        JSONObject jsonobj = JSONObject.parseObject(rawmsg);
        if (null == jsonobj) {
            logger.warn("procesBinMsg(), invalid data format, can't convert into json object ");
            return false;
        }

        String clientid = jsonobj.getString("clientid");
        headers.put("clientid", clientid);
        //logger.info("client:{}", clientid);
        String encrptedPayload = jsonobj.getString("payload");
        if (null == encrptedPayload) {
            logger.warn("procesBinMsg(), can't get base64 payload");
            return false;
        }
        //base64 Decoder
        /*
        payload: 采用base64编码,要用base64解码 decode.
      2位length+6位time+2位ID
      0-1 based64解码后的长度。
      2-7 long: timestamp(毫秒)采集时间,BIG_ENDIAN。
      8-9: 设备orderID, 数字格式。
         */
        final Base64.Decoder decoder = Base64.getDecoder();
        byte[] decrptedPayload = decoder.decode((encrptedPayload));
        if (decrptedPayload.length < 10) {
            logger.warn("procesBinMsg(), invalid payload data");
            return false;
        }
        //data stamped time (ms)
        // ByteBuffer 默认为大端(BIG_ENDIAN)模式

        long msseconds = 0L;
        for (int i = 2; i < 8; i++) {
            msseconds = (msseconds << 8) | (decrptedPayload[i] & 0xFF);
        }
        //logger.info("payload ms=" + msseconds);
        Date dataStampeddDate = new Date(msseconds);

        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
        formatter.setTimeZone(TimeZone.getTimeZone(ZoneId.of("UTC")));
        headers.put("dataday", formatter.format(dataStampeddDate));
        // logger.info("dataday:{}", formatter.format(dataStampeddDate));

        SimpleDateFormat hourformatter = new SimpleDateFormat("HH");
        hourformatter.setTimeZone(TimeZone.getTimeZone(ZoneId.of("UTC")));

        headers.put("datahour", hourformatter.format(dataStampeddDate));
        // logger.info("hour:{}", hourformatter.format(dataStampeddDate));

        //orderID 2 byte
        ByteBuffer orderIDbytes = ByteBuffer.wrap(decrptedPayload, 8, 2);
        short orderID = orderIDbytes.getShort();
        headers.put("orderid", orderID + "");
        return true;
    }

   
    private boolean processEXJsonMsg(Event event) {
        // logger.info("begin to process EXJson message");
        Map<String, String> headers = event.getHeaders();
        String rawmsg = null;
        try {
            rawmsg = new String(event.getBody(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.info("processEXJsonMsg(), exceptions happend:{}", e.getMessage());
            return false;
        }
        JSONObject jsonobj = null;
        try {
            jsonobj = JSONObject.parseObject(rawmsg);
        } catch (RuntimeException e) {
            logger.warn("processEXJsonMsg(), can't convert into json object:{}", rawmsg);
            return false;
        }
        if (null == jsonobj) {
            logger.warn("processEXJsonMsg(), invalid data format, can't convert into json object:{}", rawmsg);
            return false;
        }
        // logger.info("retrieved jason content:{}",jsonobj);
        String clientid = jsonobj.getString("_gatewayId");
        headers.put("clientid", clientid);
        //logger.info("clientID:{}", clientid);
        JSONObject payloadobj = jsonobj.getJSONObject("data");
        if (null == payloadobj) {
            logger.warn("processEXJsonMsg(), can't get payload");
            return false;
        }
        //data stamped datetime
        //"timestamp": "2019-07-22 18:05:16.881" //数据采集时间,UTC
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
        sdf.setTimeZone(TimeZone.getTimeZone(ZoneId.of("UTC")));
        Date datatime = null;
        try {
            String mytime=payloadobj.getString("timestamp");
            if(null == mytime){
                logger.warn("No field timestamp value in data section!");
                return false;
            }
            datatime = sdf.parse(mytime);
        } catch (ParseException e) {
            logger.info("processEXJsonMsg(), exceptions happend:{}", e.getMessage());
            return false;
        }

        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
        formatter.setTimeZone(TimeZone.getTimeZone(ZoneId.of("UTC")));
        headers.put("dataday", formatter.format(datatime));
        //logger.info("dataday:{}", formatter.format(datatime));

        SimpleDateFormat datahour = new SimpleDateFormat("HH");
        datahour.setTimeZone(TimeZone.getTimeZone("UTC"));
        headers.put("datahour", datahour.format(datatime));
        // logger.info("hour:{}", formatter.format(datahour));
        int orderID = jsonobj.getIntValue("_order");
        headers.put("orderid", orderID + "");
        return true;
    }

    //Json msg format

    private boolean processJsonMsg(Event event) {
        logger.info("not implemented yet");
        return true;
    }

    /**
     * 拦截器构造方法,在自定义拦截器静态内部类的build方法中调用,用来创建自定义拦截器对象。
     */
    public OfflineDataFlumeInterceptor() {
        //logger.info("--- OfflineDataFlumeInterceptor: here ---");
    }

    /**
     * 该方法用来初始化拦截器,在拦截器的构造方法执行之后执行,也就是创建完拦截器对象之后执行
     */
    @Override
    public void initialize() {
        logger.info("--- OfflineDataFlumeInterceptor: initialize here  ---");
    }
    /**
     * 用来处理每一个event对象,该方法不会被系统自动调用,
     * 一般在 List<Event> intercept(List<Event> events) 方法内部调用。
     *
     * @param event
     * @return
     */
    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        if (headers.isEmpty()) {
            logger.warn("intercept() headers is empty!");
            return null;
        }
        // logger.info("headers:{}", headers);
        if (event.getBody().length == 0) {
            logger.warn("intercept() data body empty!");
            return null;
        }
        //get raw data  format firstly
        if (null == DataFormat) {
            logger.warn("file dataformat is NOT set!");
            return null;
        }
        boolean ret = false;
        switch (DataFormat) {
            case JSONFORMAT:
                ret = processJsonMsg(event);
                if (ret == false) {
                    return null;
                }
                break;
            case BINFORMAT:
                ret = procesBinMsg(event);
                if (ret == false) {
                    return null;
                }
                break;
            case EXJSONFORMAT:
                ret = processEXJsonMsg(event);
                if (ret == false) {
                    return null;
                }
                break;
            default:
                logger.error("Unsupported data format:{}",DataFormat);
                return null;
        }
        return event;
    }

    /**
     * 用来处理一批event对象集合,集合大小与flume启动配置有关,和transactionCapacity大小保持一致。
     * 一般直接调用 Event intercept(Event event) 处理每一个event数据。
     *
     * @param events
     * @return
     */
    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> results = new ArrayList<>();
        Event event = null;
        for (Event e : events) {
            event = intercept(e);
            if (event != null) {
                results.add(event);
            }
        }
        return results;
    }

    /**
     * 该方法主要用来销毁拦截器对象值执行,一般是一些释放资源的处理
     */
    @Override
    public void close() {
        logger.info("- OfflineDataFlumeInterceptor close() here --");
    }

    /**
     * 通过该静态内部类来创建自定义对象供flume使用,实现Interceptor.Builder接口,并实现其抽象方法
     */
    public static class Builder implements Interceptor.Builder {
        /**
         * 该方法主要用来返回创建的自定义类拦截器对象
         *
         * @return
         */
        @Override
        public Interceptor build() {
            //logger.info("-- OfflineDataFlumeInterceptor build here --");
            return new OfflineDataFlumeInterceptor();
        }

        /**
         * 用来接收flume配置自定义拦截器参数
         *
         * @param context 通过该对象可以获取flume配置自定义拦截器的参数
         */
        @Override
        public void configure(Context context) {
             /*
            通过调用context对象的getString方法来获取flume配置自定义拦截器的参数,方法参数要和自定义拦截器配置中的参数保持一致+
             */
            DataFormat = context.getString("DataFormat");
            logger.info("configure(): DataFormat:{}", DataFormat);
        }
    }
}

2. Constants.java

package com.mycompany;

/**
 * Raw-data format names accepted by the interceptor's {@code DataFormat}
 * configuration parameter.
 */
public final class Constants {
    /** Plain JSON messages (handler not implemented yet). */
    public static final String JSONFORMAT = "aiotjson";
    /** Extended JSON messages with "_gatewayId", "_order" and a "data" section. */
    public static final String EXJSONFORMAT = "aiotexjson";
    /** JSON envelope carrying a base64-encoded binary payload. */
    public static final String BINFORMAT = "aiotbinjson";

    // Constant holder; never instantiated.
    private Constants() {
    }
}

3. pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.mycompany.aiot</groupId>
    <artifactId>offlineFlume</artifactId>
    <version>1.0</version>
    <packaging>jar</packaging>
    <name>AIOT offline Flume interceptor</name>
    <url>http://maven.apache.org</url>

    <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
            <java.version>1.8</java.version>
            <maven-plugins.version>3.1.1</maven-plugins.version>
            <maven-jar.version>3.1.0</maven-jar.version>
            <maven-compiler.version>3.7.0</maven-compiler.version>
            <maven-resource.version>3.0.1</maven-resource.version>
            <maven-source.version>3.0.1</maven-source.version>
            <maven-dependency.version>3.1.1</maven-dependency.version>
            <maven-assembly.version>3.1.0</maven-assembly.version>
            <logback.version>1.2.3</logback.version>
            <slf4j.version>1.7.25</slf4j.version>
        <es.version>6.3.2</es.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-sdk</artifactId>
            <version>1.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.8.0</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <!-- json -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.9.9</version>
        </dependency>
    </dependencies>
    <build>
        <sourceDirectory>src/main/java</sourceDirectory>
        <finalName>offlineFlume</finalName>
        <defaultGoal>package</defaultGoal>
        <plugins>
            <!-- 用于编译的plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven-compiler.version}</version>
                <configuration>
                    <fork>true</fork>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>

                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.mycompany.OfflineDataFlumeInterceptor</mainClass><!--main函数所在的类-->
                                </transformer>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>META-INF/spring.handlers</resource>
                                </transformer>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>META-INF/spring.schemas</resource>
                                </transformer>
                            </transformers>
                        </configuration>

                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

部署方法:

1. 把上面的java代码编译成offlineFlume.jar(与pom.xml中的finalName一致)。

2. flume安装完成后,把上面产生的jar包offlineFlume.jar 拷贝到 flume的安装目录下的 plugins.d/flume-plugins/lib 下面。

3. 修改 flume的安装目录下的 conf/flume.conf,关键点如下

# Sources, channels, and sinks are defined per
# agent name, in this case 'myagent1'.
myagent1.sources  = ds1
myagent1.channels = channel1
myagent1.sinks    = hdfs1

# For each source, channel, and sink, set
# standard properties.

#my interceptor
myagent1.sources.ds1.interceptors = i1
myagent1.sources.ds1.interceptors.i1.type = com.mycompany.OfflineDataFlumeInterceptor$Builder 
myagent1.sources.ds1.interceptors.i1.DataFormat = aiotbinjson


myagent1.sources.ds1.type  = org.apache.flume.source.kafka.KafkaSource
myagent1.sources.ds1.channels = channel1


myagent1.sources.ds1.kafka.topics = RTDPBSF
myagent1.sources.ds1.kafka.bootstrap.servers = 10.251.7.137:9092,10.251.7.138:9092,10.251.7.139:9092
myagent1.sources.ds1.kafka.consumer.auto.offset.reset = earliest 

myagent1.sources.ds1.kafka.consumer.group.id = offlineFlumeGroup2
myagent1.sources.ds1.batchSize = 500
myagent1.sources.ds1.batchDurationMillis = 2000

myagent1.channels.channel1.type = memory
myagent1.channels.channel1.capacity  = 100000 
myagent1.channels.channel1.transactionCapacity = 50000
myagent1.channels.channel1.keep-alive = 60

myagent1.sinks.hdfs1.type   = hdfs
myagent1.sinks.hdfs1.channel  = channel1
myagent1.sinks.hdfs1.hdfs.path = /druiddata/binjson/%{clientid}_%{orderid}/%{dataday}_%{datahour}/
myagent1.sinks.hdfs1.hdfs.filePrefix = offline_binjson 
#myagent1.sinks.hdfs1.hdfs.fileSuffix = .data

myagent1.sinks.hdfs1.hdfs.round=false
#控制文件的滚动频率
#event数量维度
myagent1.sinks.hdfs1.hdfs.rollInterval = 0
myagent1.sinks.hdfs1.hdfs.rollSize = 0
myagent1.sinks.hdfs1.hdfs.rollCount = 100000
myagent1.sinks.hdfs1.hdfs.batchSize = 5000
myagent1.sinks.hdfs1.hdfs.idleTimeout = 10
myagent1.sinks.hdfs1.hdfs.threadsPoolSize = 10
myagent1.sinks.hdfs1.hdfs.useLocalTimeStamp = true
#生成的文件类型,默认是Sequencefile,可用DataStream则为普通文本
#myagent1.sinks.hdfs1.hdfs.fileType = DataStream
#myagent1.sinks.hdfs1.hdfs.writeFormat = Text
myagent1.sinks.hdfs1.hdfs.fileType = CompressedStream 
#myagent1.sinks.hdfs1.hdfs.codeC = gzip
#bzip2 支持切分压缩
myagent1.sinks.hdfs1.hdfs.codeC = bzip2

注意下面的配置

#my interceptor
myagent1.sources.ds1.interceptors = i1
myagent1.sources.ds1.interceptors.i1.type = com.mycompany.OfflineDataFlumeInterceptor$Builder 
myagent1.sources.ds1.interceptors.i1.DataFormat = aiotbinjson

官方文档:http://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值