Flume自定义Source

最新推荐文章于 2024-08-06 22:24:16 发布

xqg1316

最新推荐文章于 2024-08-06 22:24:16 发布

阅读量521

点赞数

分类专栏： Flume 文章标签： Flume

本文链接：https://blog.csdn.net/Migumigu1316/article/details/88245783

版权

Flume 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

一、在使用flume采集日志时，可以让flume监控某一个文件，把产生的数据传输给指定的sink。但是如果某段时间flume所在机器宕机了，那么重新启动后再去监控时，不会接着上一次读取的位置继续读取，从而导致数据丢失。针对这种情况，我们可能需要自定义一个source，记录偏移量，保证每次都是接着上次的位置继续读。
二、下面就是具体实现的代码
在写代码时可以参照官方给的source的源码进行编写，比如ExecSource
flume的生命周期：先执行构造器，再执行 configure方法 --> start方法 --> processor.process --> stop方法
读取配置文件（配置的内容包括：读取哪个文件、文件编码、偏移量写到哪个文件、每隔多长时间检测一次文件是否有新内容）

package com.flume.source;

import org.apache.commons.io.FileUtils;
import org.apache.flume.Context;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
//import java.nio.charset.Charset;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Custom Flume source that tails a single text file and records its read offset.
 *
 * <p>After every delivered line the current byte offset of the data file is written to a
 * separate position file. On the next start the source reads that offset and resumes from
 * there instead of starting over.
 *
 * <p>Configuration keys read in {@link #configure(Context)}:
 * <ul>
 *   <li>{@code filePath} - the file to read (required)</li>
 *   <li>{@code positionFile} - the file the offset is stored in (required)</li>
 *   <li>{@code interval} - milliseconds to wait when no new line is available (default 2000)</li>
 *   <li>{@code charset} - charset of the data file (default UTF-8)</li>
 * </ul>
 *
 * Originally written 2019/03/06.
 */

// Remember: the dependencies are pulled in for development only; the packaged jar does not
// include them.

public class MyFlumeSource extends AbstractSource implements Configurable, EventDrivenSource {

    private static final Logger logger = LoggerFactory.getLogger(MyFlumeSource.class);

    // The data file that is being tailed.
    private String filePath;

    // The file in which the read offset is persisted.
    private String positionFile;

    // Wait time in milliseconds between polls when no new data is available.
    private Long interval;

    // Charset name of the data file.
    private String charset;

    private FileRunnable fileRunnable;

    private ExecutorService pool;


    /**
     * Reads the source settings from the agent configuration.
     *
     * @param context the configuration of this source
     * @throws IllegalArgumentException if {@code filePath} or {@code positionFile} is missing,
     *                                  {@code interval} is negative, or {@code charset} is not
     *                                  supported by this JVM
     */
    @Override
    public void configure(Context context) {

        filePath = context.getString("filePath");

        positionFile = context.getString("positionFile");

        // By default check for new content every 2 seconds.
        interval = context.getLong("interval", 2000L);

        charset = context.getString("charset", "UTF-8");

        // Fail here with a clear message instead of with an NPE later in start().
        if (filePath == null || filePath.trim().isEmpty()) {
            throw new IllegalArgumentException("filePath must be configured");
        }
        if (positionFile == null || positionFile.trim().isEmpty()) {
            throw new IllegalArgumentException("positionFile must be configured");
        }
        // Thread.sleep rejects negative values.
        if (interval < 0) {
            throw new IllegalArgumentException("interval must not be negative: " + interval);
        }
        // An unsupported charset would otherwise fail on every single line at runtime.
        if (!java.nio.charset.Charset.isSupported(charset)) {
            throw new IllegalArgumentException("unsupported charset: " + charset);
        }
    }

    /**
     * Starts a single worker thread that tails the configured file.
     */
    @Override
    public synchronized void start() {

        pool = Executors.newSingleThreadExecutor();

        final ChannelProcessor channelProcessor = getChannelProcessor();

        fileRunnable = new FileRunnable(filePath, positionFile, interval, charset, channelProcessor);

        pool.execute(fileRunnable);

        super.start();
    }

    /**
     * Asks the worker to finish and waits for the executor to terminate.
     *
     * <p>The worker is not interrupted; it notices the cleared flag after its current read or
     * sleep, so this may block for up to roughly one polling interval. If the calling thread is
     * interrupted while waiting, the wait is abandoned and the interrupt status is restored.
     */
    @Override
    public synchronized void stop() {

        // Guard against stop() being called without a preceding start().
        if (fileRunnable != null) {
            fileRunnable.setFlag(false);
        }

        if (pool != null) {
            pool.shutdown();

            try {
                // Poll in 500 ms steps until the worker has finished.
                while (!pool.awaitTermination(500, TimeUnit.MILLISECONDS)) {
                    logger.debug("Waiting for exec executor service to stop");
                }
            } catch (InterruptedException e) {
                // Leave the loop: with the interrupt flag set awaitTermination would throw
                // again immediately and the loop would spin.
                logger.debug("Interrupted while waiting for exec executor service to stop. Just exiting.");
                Thread.currentThread().interrupt();
            }
        }
        super.stop();
    }


    /**
     * Worker that reads the data file line by line, forwards each line as an event and
     * persists the offset after each delivered line.
     */
    private static class FileRunnable implements Runnable {

        // Written by the stopping thread, read by the worker thread.
        private volatile boolean flag = true;

        // Offset up to which lines have been delivered.
        private long offset = 0L;

        private final long interval;

        private final String charset;

        private final String filePath;

        // Allows reading to start directly at the stored offset; null while not open.
        private RandomAccessFile randomAccessFile;

        // Used to hand events to the channel(s).
        private final ChannelProcessor channelProcessor;

        private final File posFile;

        public void setFlag(boolean flag) {
            this.flag = flag;
        }

        /**
         * Loads the stored offset (0 if there is none) and tries to open the data file at
         * that offset. If the data file cannot be opened yet, the error is logged and
         * {@link #run()} retries.
         */
        public FileRunnable(String filePath, String positionFile, Long interval, String charset, ChannelProcessor channelProcessor) {

            this.filePath = filePath;

            this.interval = interval;

            this.charset = charset;

            this.channelProcessor = channelProcessor;

            posFile = new File(positionFile);

            offset = loadOffset();

            try {
                openDataFile();
            } catch (FileNotFoundException e) {
                logger.error("read filePath file error: 读取文件时发生错误", e);
            } catch (IOException e) {
                logger.error("randomAccessFile seek error", e);
            }
        }

        /**
         * Reads the stored offset from the position file, creating the file (and its parent
         * directories) when it does not exist yet.
         *
         * @return the stored offset, or 0 when none is stored or it cannot be read or parsed
         */
        private long loadOffset() {
            if (!posFile.exists()) {
                try {
                    File parent = posFile.getParentFile();
                    if (parent != null && !parent.exists()) {
                        parent.mkdirs();
                    }
                    posFile.createNewFile();
                } catch (IOException e) {
                    logger.error("create positionFile file error: 创建保存偏移量的文件时失败", e);
                }
                return 0L;
            }

            try {
                String offsetString = FileUtils.readFileToString(posFile);
                // Trim so that a trailing newline (e.g. from a hand-created file) is accepted.
                String trimmed = offsetString == null ? "" : offsetString.trim();
                if (!trimmed.isEmpty()) {
                    // A negative value cannot be used with seek().
                    return Math.max(0L, Long.parseLong(trimmed));
                }
            } catch (IOException e) {
                logger.error("read positionFile file error: 读取保存偏移量的文件时失败", e);
            } catch (NumberFormatException e) {
                logger.error("positionFile does not contain a valid offset, reading from the beginning: " + posFile, e);
            }
            return 0L;
        }

        /**
         * Opens the data file and positions it at {@link #offset}. If the stored offset lies
         * beyond the end of the file, reading restarts at the beginning.
         *
         * @throws IOException if the file cannot be opened or positioned
         */
        private void openDataFile() throws IOException {
            RandomAccessFile file = new RandomAccessFile(filePath, "r");
            try {
                long length = file.length();
                if (offset > length) {
                    logger.warn("stored offset " + offset + " is beyond the end of " + filePath
                            + " (" + length + " bytes), reading from the beginning");
                    offset = 0L;
                }
                file.seek(offset);
            } catch (IOException e) {
                try {
                    file.close();
                } catch (IOException ignored) {
                    // The seek failure is the error worth reporting.
                }
                throw e;
            }
            randomAccessFile = file;
        }

        /** Closes the data file if it is open; the next loop iteration reopens it at {@link #offset}. */
        private void closeDataFile() {
            if (randomAccessFile != null) {
                try {
                    randomAccessFile.close();
                } catch (IOException e) {
                    logger.warn("close randomAccessFile error", e);
                }
                randomAccessFile = null;
            }
        }

        /**
         * Polls the data file until the flag is cleared or the thread is interrupted.
         * A failed read or delivery is logged and retried from the last stored offset after
         * one interval.
         */
        @Override
        public void run() {
            try {
                while (flag) {
                    try {
                        if (randomAccessFile == null) {
                            openDataFile();
                        }

                        String line = randomAccessFile.readLine();
                        if (line == null) {
                            // Nothing new yet, wait before polling again.
                            Thread.sleep(interval);
                            continue;
                        }

                        // readLine() maps every byte to one char, so getBytes("iso8859-1")
                        // yields the raw bytes again. Decode them with the configured charset
                        // and encode the body with the same charset rather than the platform
                        // default.
                        byte[] rawBytes = line.getBytes("iso8859-1");
                        byte[] body = new String(rawBytes, charset).getBytes(charset);
                        channelProcessor.processEvent(EventBuilder.withBody(body));

                        // Only advance the offset after the event was accepted.
                        offset = randomAccessFile.getFilePointer();

                        FileUtils.writeStringToFile(posFile, String.valueOf(offset));

                    } catch (IOException e) {
                        logger.error("read randomAccessFile error", e);
                        recoverAfterFailure();
                    } catch (RuntimeException e) {
                        // E.g. the channel rejected the event. Without this the worker thread
                        // would die silently and the line would be lost.
                        logger.error("process event error, retrying from offset " + offset, e);
                        recoverAfterFailure();
                    }
                }
            } catch (InterruptedException e) {
                logger.error("sleep error", e);
                Thread.currentThread().interrupt();
            } finally {
                closeDataFile();
            }
        }

        /** Drops the file handle so reading resumes at {@link #offset}, then backs off for one interval. */
        private void recoverAfterFailure() throws InterruptedException {
            closeDataFile();
            Thread.sleep(interval);
        }
    }
}

三, 打包上传到flume的lib下

jar下载位置

https://download.csdn.net/download/migumigu1316/11002934

四,测试

案例:

Flume自定义Source

#定义agent名， source、channel、sink的名称
tier1.sources = r1
tier1.channels = c1
tier1.sinks = k1

#具体定义source,这里的type是自定义的source的类的全路径,包名.类名
tier1.sources.r1.type = com.flume.source.MyFlumeSource
#这里的参数名都和自定义类的参数一致
#读取哪个文件
tier1.sources.r1.filePath = /opt/test/test_data/test01.log
#偏移量保存的文件
#切记:提前创建好保存偏移量的文件,不然首次运行的时候会报错
tier1.sources.r1.positionFile = /opt/test/test_flume/myFlumeSource/logs/posi.txt
#时间间隔，每隔多久读取一次
tier1.sources.r1.interval = 2000
#编码
tier1.sources.r1.charset = UTF-8

#具体定义channel
tier1.channels.c1.type = memory
tier1.channels.c1.capacity = 1000
tier1.channels.c1.transactionCapacity = 100

#具体定义sink
tier1.sinks.k1.type = file_roll
tier1.sinks.k1.sink.directory = /opt/test/test_flume/myFlumeSource/
tier1.sinks.k1.sink.rollInterval=0

#组装source、channel、sink
tier1.sources.r1.channels = c1
tier1.sinks.k1.channel = c1

配置flume-mySource-dir-hdfs.conf

#自定义Source,channel的type采用file channel,Sink到HDFS

#定义agent名， source、channel、sink的名称
tier1.sources = r1
tier1.channels = c1
tier1.sinks = k1

#具体定义source,这里的type是自定义的source的类的全路径
tier1.sources.r1.type = com.flume.source.MyFlumeSource
#这里的参数名都和自定义类的参数一致
#读取哪个文件
tier1.sources.r1.filePath = /opt/test/test_data/test01.log
#偏移量保存的文件
#切记:提前创建好保存偏移量的文件,不然首次运行的时候会报错
tier1.sources.r1.positionFile = /opt/test/test_flume/myFlumeSource/logs/offset.txt
#时间间隔，每隔多久读取一次
tier1.sources.r1.interval = 2000
#编码
tier1.sources.r1.charset = UTF-8

#具体定义channel
tier1.channels.c1.type=file
tier1.channels.c1.checkpointDir=/var/lib/flume-ng/flumedata/checkpoint
tier1.channels.c1.dataDirs=/var/lib/flume-ng/flumedata/data

#具体定义sink
tier1.sinks.k1.type= hdfs

#hdfs的路径
tier1.sinks.k1.hdfs.path= hdfs://hadoop001:8020/test_flume/mySrcFiChannel/%y-%m-%d/%H

#写入hdfs的文件名前缀
tier1.sinks.k1.hdfs.filePrefix= channel-
#写入 hdfs 的文件名后缀，比如：.lzo .log等
#tier1.sinks.k1.hdfs.fileSuffix= .log

#是否按照时间滚动文件夹
tier1.sinks.k1.hdfs.round = true

#多少时间单位创建一个新的文件夹
#默认值：1，时间上进行“舍弃”的值；
tier1.sinks.k1.hdfs.roundValue = 1

#重新定义时间单位
#默认值：seconds,时间上进行”舍弃”的单位，包含：second,minute,hour
tier1.sinks.k1.hdfs.roundUnit = hour

#是否使用本地时间戳
tier1.sinks.k1.hdfs.useLocalTimeStamp = true

#积攒多少个Event才flush到HDFS一次
#默认值：100：每个批次刷新到 HDFS 上的 events 数量；
tier1.sinks.k1.hdfs.batchSize = 100

#writeFormat：写 sequence 文件的格式。包含：Text, Writable（默认）
tier1.sinks.k1.hdfs.writeFormat = text

#设置文件类型，可支持压缩
#默认值：SequenceFile，文件格式，包括：SequenceFile, DataStream,CompressedStream
#当使用DataStream时候，文件不会被压缩，不需要设置hdfs.codeC;
#当使用CompressedStream时候，必须设置一个正确的hdfs.codeC值；
##codeC：文件压缩格式，包括：gzip, bzip2, lzo, lzop, snappy
tier1.sinks.k1.hdfs.fileType = DataStream

#多久生成一个新的文件,
#默认值：30：hdfs sink 间隔多长将临时文件滚动成最终目标文件，单位：秒；
#如果设置成0，则表示不根据时间来滚动文件
tier1.sinks.k1.hdfs.rollInterval = 60

#设置每个文件的滚动大小大概是128M
#默认值：1024：当临时文件达到多少（单位：bytes）时，滚动成目标文件；如果设置成0，则表示不根据临时文件大小来滚动文件；
tier1.sinks.k1.hdfs.rollSize = 134217700

#文件的滚动与Event数量无关
#默认值：10：当 events 数据达到该数量时候，将临时文件滚动成目标文件；如果设置成0，则表示不根据events数据来滚动文件
tier1.sinks.k1.hdfs.rollCount = 0

#组装source、channel、sink
tier1.sources.r1.channels = c1
tier1.sinks.k1.channel = c1

xqg1316

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Flume自定义Source

一、在使用flume采集日志时，可以通过flume进行监控某一个文件把生产的数据传输给指定的sink，但是如果某段时间flume所在机器宕机了，那么当重新启动后，在去监控时，会导致有数据丢失，不是接着上一次的数据继续进行读取，因此针对这种情况时可能需要我们自定义一个source，记录偏移量，每次都是接着上次继续读二、下面就是具体实现的代码再写代码时可以参照官方给的source的源码进行编写，比...
复制链接

扫一扫

专栏目录