Flume: landing Kafka source data to local files

Flume consumes events from a Kafka source and writes them to local files, which are then uploaded to HDFS; this avoids having Flume write to HDFS directly.
Maven project
pom.xml

<properties>
	<version.flume>1.7.0</version.flume>
</properties>

<dependencies>
	<dependency>
		<groupId>org.apache.flume</groupId>
		<artifactId>flume-ng-core</artifactId>
		<version>${version.flume}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.flume</groupId>
		<artifactId>flume-ng-configuration</artifactId>
		<version>${version.flume}</version>
	</dependency>
</dependencies>
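
To use the custom sink below, the built jar has to end up on Flume's classpath. A minimal sketch of that step, assuming a standard mvn package build, the Flume install path used later in this post, and the udf-1.0.jar artifact name taken from the jar list further down (all assumptions, not stated explicitly in the post):

# hypothetical build-and-deploy step: copy the jar containing RollingFileSinkExtra into Flume's lib/
mvn clean package
cp target/udf-1.0.jar /home/q/performance/apache-flume-1.7.0-bin/lib/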

java

package com.qunar.qav.flume;

/**
 * Created by lifei on 2018/7/31.
 */
import java.io.File;
import java.util.concurrent.atomic.AtomicInteger;
public class PathManagerExtra {
    private long seriesTimestamp;
    private String baseDirectory;
    private AtomicInteger fileIndex;

    private File currentFile;

    private String pefix;
    private String suffix;


    public PathManagerExtra() {
        seriesTimestamp = System.currentTimeMillis();
        fileIndex = new AtomicInteger();
    }


    public File nextFile() {
        // (1) Replace the %Y%m%d%H token in the base directory (e.g. /usr/local/flume/xxxxpjmLog/%Y%m%d%H)
        //     with the current date/hour. For simplicity the whole token is replaced, so the config file
        //     must use exactly the same %Y%m%d%H pattern.
        String dirStr = SinkPjmDefinedUtils.getRealPath(baseDirectory);
        // (2) Replace the %Y%m%d%H%M token in the file prefix (e.g. flume_bjxd02.%Y%m%d%H%M)
        //     with the current date/hour/minute.
        String pefixStr = SinkPjmDefinedUtils.getRealPathFilePrefix(pefix);
        // (3) Build the full file path, e.g.
        //     /data/logs/flume/allpjm/20150115/flume_bjxd02.201501151029.1421288975655.log
        //     (a .tmp suffix is appended while the file is still being written)
        String filePath = dirStr + pefixStr + "." + System.currentTimeMillis() + suffix + ".tmp";
        currentFile = SinkPjmDefinedUtils.CreateFolderAndFile(dirStr, filePath);

        return currentFile;
    }
    /* public File nextFile() {
       currentFile = new File(baseDirectory, seriesTimestamp + "-"
           + fileIndex.incrementAndGet());
       return currentFile;
     }
   */
    public File getCurrentFile() {
        if (currentFile == null) {
            return nextFile();
        }

        return currentFile;
    }
    public void rotate() {
        currentFile = null;
    }
    public String getBaseDirectory() {
        return baseDirectory;
    }
    public void setBaseDirectory(String baseDirectory) {
        this.baseDirectory = baseDirectory;
    }
    public long getSeriesTimestamp() {
        return seriesTimestamp;
    }
    public AtomicInteger getFileIndex() {
        return fileIndex;
    }


    public String getPefix() {
        return pefix;
    }


    public void setPefix(String pefix) {
        this.pefix = pefix;
    }
    public String getSuffix() {
        return suffix;
    }
    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }
}



package com.qunar.qav.flume;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Created by lifei on 2018/7/31.
 */
public class SinkPjmDefinedUtils {

    /**
     * Replaces the %Y%m%d%H token in a directory path with the current date/hour (yyyyMMddHH).
     *
     * @author pjm
     * @version 2015-01-15
     */
    public static String getRealPath(String path){
        if (path.contains("%Y%m%d%H")) {
            Date today = new Date();
            SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHH");
            String formattedDate = formatter.format(today);
            System.out.println(formattedDate);
            path = path.replace("%Y%m%d%H", formattedDate);
        }
        return path;
    }

    /**
     * Replaces the %Y%m%d%H%M token in a file prefix with the current date/hour/minute (yyyyMMddHHmm).
     *
     * @author pjm
     * @version 2015-01-15
     */
    public static String getRealPathFilePrefix(String path){
        if (path.contains("%Y%m%d%H%M")) {
            Date today = new Date();
            SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
            String formattedDate = formatter.format(today);
            System.out.println(formattedDate);
            path = path.replace("%Y%m%d%H%M", formattedDate);
        }
        return path;
    }

    /**
     * Creates the parent directory if it does not exist and returns a File for the given path.
     *
     * @author pjm
     * @version 2015-01-15
     */
    public static File CreateFolderAndFile(String dirpath,String filepath){

//String dirpath  = "/data/logs/flume/All/20150115/";
//String filepath = "/data/logs/flume/All/20150115/flume_bjxd04.201501150900.1421283612463.log";

//String dirpath  = "/usr/local/flume/AllLog/20150115/";
//String filepath = "/usr/local/flume/AllLog/20150115/flume_bjxd04.201501150900.1421283612463.log";

        File dirFile = new File(dirpath);
        // create the directory if it does not exist
        if (!dirFile.exists()) {
            dirFile.mkdirs();
        }
        File f = new File(filepath);
/*		// create the file
		if (!f.exists()) {
			try {
				f.createNewFile();
//				f.createTempFile("kkk2", ".java", dirFile);
			} catch (IOException e) {
				e.printStackTrace();
			}
		}*/
        return f;
    }

}
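
As a quick sanity check of the token substitution, here is a minimal sketch of how the sink.directory and sink.filePrefix values used in the configuration below expand into a concrete file path. PathExpansionDemo is a hypothetical helper written for illustration only, not part of the original code:

package com.qunar.qav.flume;

import java.io.File;

// Hypothetical demo class: prints the file path RollingFileSinkExtra would write to
// for the sink.directory / sink.filePrefix / sink.fileSuffix values used below.
public class PathExpansionDemo {
    public static void main(String[] args) {
        // %Y%m%d%H in the directory becomes yyyyMMddHH
        String dir = SinkPjmDefinedUtils.getRealPath("/home/q/performance/shell/data/%Y%m%d%H/");
        // %Y%m%d%H%M in the prefix becomes yyyyMMddHHmm
        String prefix = SinkPjmDefinedUtils.getRealPathFilePrefix("performance.%Y%m%d%H%M");
        // e.g. /home/q/performance/shell/data/2018073110/performance.201807311030.1533004200000.log.tmp
        String path = dir + prefix + "." + System.currentTimeMillis() + ".log" + ".tmp";
        File f = SinkPjmDefinedUtils.CreateFolderAndFile(dir, path);
        System.out.println(f.getAbsolutePath());
    }
}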

package com.qunar.qav.flume;

import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Created by lifei on 2018/7/31.
 */

public class RollingFileSinkExtra extends AbstractSink implements
        Configurable {

    private static final Logger logger = LoggerFactory
            .getLogger(RollingFileSinkExtra.class);
    private static final long defaultRollInterval = 30;
    private static final int defaultBatchSize = 100;

    private int batchSize = defaultBatchSize;

    private String directory;  // RollingFileSink declares this as a File; here it is a String because the %Y%m%d%H token still has to be substituted
    private long rollInterval;
    private OutputStream outputStream;
    private ScheduledExecutorService rollService;

    private String serializerType;
    private Context serializerContext;
    private EventSerializer serializer;

    private SinkCounter sinkCounter;

    private PathManagerExtra pathController;
    private volatile boolean shouldRotate;

    private String pefix;
    private String suffix;

    public RollingFileSinkExtra() {
        pathController = new PathManagerExtra();
        shouldRotate = false;
    }


    @Override
    public void configure(Context context) {
        // read the configuration parameters: sink.directory, sink.rollInterval, sink.filePrefix, sink.fileSuffix
        directory = context.getString("sink.directory");
        String rollInterval = context.getString("sink.rollInterval");
        pefix = context.getString("sink.filePrefix");
        suffix = context.getString("sink.fileSuffix");

        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(context.getSubProperties("sink."
                + EventSerializer.CTX_PREFIX));

        Preconditions.checkArgument(directory != null,
                "Directory may not be null");
        Preconditions.checkNotNull(serializerType,
                "Serializer type is undefined");

        if (rollInterval == null) {
            this.rollInterval = defaultRollInterval;
        } else {
            this.rollInterval = Long.parseLong(rollInterval);
        }

        batchSize = context.getInteger("sink.batchSize", defaultBatchSize);

        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }
    }

    @Override
    public void start() {
        logger.info("Starting {}...", this);
        sinkCounter.start();
        super.start();

        pathController.setBaseDirectory(directory);
        pathController.setPefix(pefix);
        pathController.setSuffix(suffix);

        if (rollInterval > 0) {

            rollService = Executors.newScheduledThreadPool(
                    1,
                    new ThreadFactoryBuilder().setNameFormat(
                            "rollingFileSink-roller-"
                                    + Thread.currentThread().getId() + "-%d")
                            .build());

			/*
			 * Every N seconds, mark that it's time to rotate. We purposefully
			 * do NOT touch anything other than the indicator flag to avoid
			 * error handling issues (e.g. IO exceptions occurring in two
			 * different threads). Resist the urge to actually perform rotation
			 * in a separate thread!
			 */
            rollService.scheduleAtFixedRate(new Runnable() {

                @Override
                public void run() {
                    logger.debug("Marking time to rotate file {}",
                            pathController.getCurrentFile());
                    shouldRotate = true;
                }

            }, rollInterval, rollInterval, TimeUnit.SECONDS);
        } else {
            logger.info("RollInterval is not valid, file rolling will not happen.");
        }
        logger.info("RollingFileSink {} started.", getName());
    }

    @Override
    public Status process() throws EventDeliveryException {
        if (shouldRotate) {   // when shouldRotate is true, stop rolling to the current file and open a new one for subsequent writes
            logger.debug("Time to rotate {}", pathController.getCurrentFile());

            if (outputStream != null) {
                logger.debug("Closing file {}", pathController.getCurrentFile());

                try {
                    serializer.flush();
                    serializer.beforeClose();
                    outputStream.close();
                    sinkCounter.incrementConnectionClosedCount();
                    shouldRotate = false;
                } catch (Exception e) {
                    sinkCounter.incrementConnectionFailedCount();
                    throw new EventDeliveryException("Unable to rotate file "
                            + pathController.getCurrentFile()
                            + " while delivering event", e);
                } finally {
                    serializer = null;
                    outputStream = null;
                }

                // remove the .tmp suffix: files carry a .tmp suffix while they are being written,
                // and once writing is finished the suffix is stripped
                File ff = pathController.getCurrentFile();
                try {
                    FileUtils.moveFile( ff, new File(ff.getAbsolutePath().substring(0, ff.getAbsolutePath().indexOf(".tmp"))));
                } catch (IOException e) {
                    e.printStackTrace();
                }
                pathController.rotate();
            }
        }

        if (outputStream == null) {
            File currentFile = pathController.getCurrentFile();
            logger.debug("Opening output stream for file {}", currentFile);
            try {
                outputStream = new BufferedOutputStream(new FileOutputStream(
                        currentFile));
                serializer = EventSerializerFactory.getInstance(serializerType,
                        serializerContext, outputStream);
                serializer.afterCreate();
                sinkCounter.incrementConnectionCreatedCount();
            } catch (IOException e) {
                sinkCounter.incrementConnectionFailedCount();
                throw new EventDeliveryException("Failed to open file "
                        + pathController.getCurrentFile()
                        + " while delivering event", e);
            }
        }

        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        Event event = null;
        Status result = Status.READY;

        try {
            transaction.begin();
            int eventAttemptCounter = 0;
            for (int i = 0; i < batchSize; i++) {
                event = channel.take();
                if (event != null) {
                    sinkCounter.incrementEventDrainAttemptCount();
                    eventAttemptCounter++;
                    serializer.write(event);

					/*
					 * FIXME: Feature: Rotate on size and time by checking bytes
					 * written and setting shouldRotate = true if we're past a
					 * threshold.
					 */

					/*
					 * FIXME: Feature: Control flush interval based on time or
					 * number of events. For now, we're super-conservative and
					 * flush on each write.
					 */
                } else {
                    // No events found, request back-off semantics from runner
                    result = Status.BACKOFF;
                    break;
                }
            }
            serializer.flush();
            outputStream.flush();
            transaction.commit();
            sinkCounter.addToEventDrainSuccessCount(eventAttemptCounter);


        } catch (Exception ex) {
            transaction.rollback();
            throw new EventDeliveryException("Failed to process transaction",
                    ex);
        } finally {
            transaction.close();
        }

        return result;
    }

    @Override
    public void stop() {
        logger.info("RollingFile sink {} stopping...", getName());
        sinkCounter.stop();
        super.stop();

        if (outputStream != null) {
            logger.debug("Closing file {}", pathController.getCurrentFile());

            try {
                serializer.flush();
                serializer.beforeClose();
                outputStream.close();
                sinkCounter.incrementConnectionClosedCount();


            } catch (IOException e) {
                sinkCounter.incrementConnectionFailedCount();
                logger.error(
                        "Unable to close output stream. Exception follows.", e);
            } finally {
                outputStream = null;
                serializer = null;
            }
        }
        if (rollInterval > 0) {
            rollService.shutdown();

            while (!rollService.isTerminated()) {
                try {
                    rollService.awaitTermination(1, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    logger.debug(
                            "Interrupted while waiting for roll service to stop. "
                                    + "Please report this.", e);
                }
            }
        }
        logger.info("RollingFile sink {} stopped. Event metrics: {}",
                getName(), sinkCounter);
    }

    public String getDirectory() {
        return directory;
    }

    public void setDirectory(String directory) {
        this.directory = directory;
    }

    public long getRollInterval() {
        return rollInterval;
    }

    public void setRollInterval(long rollInterval) {
        this.rollInterval = rollInterval;
    }

}

Flume configuration
Configuration for older Kafka versions (zookeeperConnect)

agent.sources = source1
agent.channels = memoryChannel
agent.sinks = k1


#source
agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.zookeeperConnect = xxxxxxxxx.com:2181
agent.sources.source1.topic = custom_wireless_m_pub_loganalysts
agent.sources.source1.groupId = test-group3
agent.sources.source1.batchSize = 1000
agent.sources.source1.batchDurationMillis = 1000

#channel
agent.channels.memoryChannel.type = memory
agent.channels.memoryChannel.capacity = 10000
agent.channels.memoryChannel.transactionCapacity = 10000

#sink
#agent.sinks.k1.type = file_roll
#agent.sinks.k1.channel = c1
#agent.sinks.k1.sink.directory = /home/q/performance/apache-flume-1.7.0-bin/testdir/

agent.sinks.k1.type = com.qunar.qav.flume.RollingFileSinkExtra
agent.sinks.k1.sink.directory = /home/q/performance/shell/data/%Y%m%d%H/
agent.sinks.k1.sink.filePrefix = performance.%Y%m%d%H%M
agent.sinks.k1.sink.fileSuffix = .log
agent.sinks.k1.sink.rollInterval = 60

#assemble
agent.sources.source1.channels = memoryChannel
agent.sinks.k1.channel = memoryChannel
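
With newer Kafka clusters, Flume 1.7's Kafka source can be pointed at the brokers instead of ZooKeeper. A sketch of the equivalent source section (broker address and port are placeholders; the channel and sink sections stay the same):

agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.kafka.bootstrap.servers = xxxxxxxxx.com:9092
agent.sources.source1.kafka.topics = custom_wireless_m_pub_loganalysts
agent.sources.source1.kafka.consumer.group.id = test-group3
agent.sources.source1.batchSize = 1000
agent.sources.source1.batchDurationMillis = 1000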

Startup command

bin/flume-ng agent --conf conf/ --conf-file conf/file.conf --name agent -Dflume.root.logger=INFO,console > run.log 2>&1 &

zookeeper-3.3.6.jar
hadoop-yarn-common-2.1.0-beta.jar
hadoop-yarn-api-2.1.0-beta.jar
hadoop-mapreduce-client-core-2.1.0-beta.jar
hadoop-common-2.2.0.jar
hadoop-auth-2.2.0.jar
hadoop-annotations-2.2.0.jar
commons-configuration-1.6.jar
hadoop-hdfs-2.2.0.jar
udf-1.0.jar

put2hdfs.sh

#!/bin/bash
source /etc/profile
dt="$(date -d "$1 3 min ago " +'%Y-%m-%d')"
h="$(date -d "$1 3 min ago " +'%H')"
hour="$(date -d "$1 3 min ago " +'%Y%m%d%H')"
min="$(date -d "$1 3 min ago " +'%Y%m%d%H%M')"
pt="/home/q/performance/shell/data/${hour}"
chmod a+w  ${pt}
cd $pt
pwd

context=`ls $pt | grep "performance.${min}"`
if [ "$context" = "" ];then
        echo "$context not exists!!!! skip"
        exit 1
fi
echo ">>>>>>>>>>>>>>>>process: $pt/$context>>>>>>>>>>>>>>>>>>"
# get the file size
FILE_SIZE=`ls -l $context | awk '{print $5}' `
echo ">>>>>>>>>>size: $FILE_SIZE<<<<<<<<<<"
# only non-empty files are compressed and loaded into Hive
if [ $FILE_SIZE -ne 0 ];then
    # compress the file
    gzip $context
    SQL=" LOAD DATA LOCAL INPATH '${pt}/${context}.gz' INTO TABLE orig_performance_all PARTITION (dt='${dt}',hour='${h}') "
    echo "$SQL"
    hive -e "use wirelessdata;${SQL};" || exit 1
fi
rm -rf ${context}.gz
echo ">>>>>>>>>>>>>>>>>>done>>>>>>>>>>>>>>>>>"
exit 0
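
The post does not show how put2hdfs.sh is scheduled; a sketch assuming it is driven by cron once a minute (schedule and log path are assumptions):

# hypothetical crontab entry; with no argument, date -d " 3 min ago" still resolves to 3 minutes ago
* * * * * /home/q/performance/shell/put2hdfs.sh >> /home/q/performance/shell/put2hdfs.log 2>&1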