Have Flume consume a Kafka source and write the events to local files, then upload those files to HDFS separately, instead of letting Flume write to HDFS directly.
Maven project
pom.xml
<properties>
    <version.flume>1.7.0</version.flume>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>${version.flume}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-configuration</artifactId>
        <version>${version.flume}</version>
    </dependency>
</dependencies>
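The sink code below also relies on Guava and Apache commons-io (FileUtils.moveFile). Both normally arrive transitively with flume-ng-core, but if the build cannot resolve them, commons-io can be declared explicitly (the version here is only an example):

<!-- Optional: only needed if commons-io is not pulled in transitively -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>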
Java code
package com.qunar.qav.flume;
/**
* Created by lifei on 2018/7/31.
*/
import java.io.File;
import java.util.concurrent.atomic.AtomicInteger;
public class PathManagerExtra {
private long seriesTimestamp;
private String baseDirectory;
private AtomicInteger fileIndex;
private File currentFile;
private String pefix;
private String suffix;
public PathManagerExtra() {
seriesTimestamp = System.currentTimeMillis();
fileIndex = new AtomicInteger();
}
public File nextFile() {
// (1) Replace the date pattern in the base directory (e.g. /usr/local/flume/xxxxpjmLog/%Y%m%d%H) with the actual date/hour.
//     For simplicity the whole token is replaced in one go, so the configuration file must use exactly the same pattern.
String dirStr = SinkPjmDefinedUtils.getRealPath(baseDirectory);
//(2) Replace %Y%m%d%H%M in the file prefix (e.g. flume_bjxd02.%Y%m%d%H%M) with the current date, hour and minute.
String pefixStr = SinkPjmDefinedUtils.getRealPathFilePrefix(pefix);
//(3) Build the full file path, e.g. /data/logs/flume/allpjm/20150115/flume_bjxd02.201501151029.1421288975655.log
// (a .tmp suffix is appended while the file is being written)
String filePath = dirStr+pefixStr+"."+System.currentTimeMillis()+suffix+".tmp";
currentFile = SinkPjmDefinedUtils.CreateFolderAndFile(dirStr, filePath);
return currentFile;
}
/* public File nextFile() {
currentFile = new File(baseDirectory, seriesTimestamp + "-"
+ fileIndex.incrementAndGet());
return currentFile;
}
*/
public File getCurrentFile() {
if (currentFile == null) {
return nextFile();
}
return currentFile;
}
public void rotate() {
currentFile = null;
}
public String getBaseDirectory() {
return baseDirectory;
}
public void setBaseDirectory(String baseDirectory) {
this.baseDirectory = baseDirectory;
}
public long getSeriesTimestamp() {
return seriesTimestamp;
}
public AtomicInteger getFileIndex() {
return fileIndex;
}
public String getPefix() {
return pefix;
}
public void setPefix(String pefix) {
this.pefix = pefix;
}
public String getSuffix() {
return suffix;
}
public void setSuffix(String suffix) {
this.suffix = suffix;
}
}
package com.qunar.qav.flume;
import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
* Created by lifei on 2018/7/31.
*/
public class SinkPjmDefinedUtils {
/**
* Replace the %Y%m%d%H date pattern in a directory path with the current date and hour. <br/>
*
* @author pjm <br/>
* @version 2015-01-15 09:44:46 <br/>
*/
public static String getRealPath(String path){
if (path.contains("%Y%m%d%H")) {
Date today = new Date();
SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHH");
String formattedDate = formatter.format(today);
System.out.println(formattedDate);
path = path.replace("%Y%m%d%H", formattedDate);
}
return path;
}
/**
* Replace the %Y%m%d%H%M date pattern in a file prefix with the current date, hour and minute.<br/>
*
* @author pjm <br/>
* @version 2015-01-15 09:45:32 <br/>
*/
public static String getRealPathFilePrefix(String path){
if (path.contains("%Y%m%d%H%M")) {
Date today = new Date();
SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
String formattedDate = formatter.format(today);
System.out.println(formattedDate);
path = path.replace("%Y%m%d%H%M", formattedDate);
}
return path;
}
/**
* Create the target directory if necessary and return a File handle for the given path.<br/>
*
* @author pjm <br/>
* @version 2015-01-15 09:45:48 <br/>
*/
public static File CreateFolderAndFile(String dirpath,String filepath){
//String dirpath = "/data/logs/flume/All/20150115/";
//String filepath = "/data/logs/flume/All/20150115/flume_bjxd04.201501150900.1421283612463.log";
//String dirpath = "/usr/local/flume/AllLog/20150115/";
//String filepath = "/usr/local/flume/AllLog/20150115/flume_bjxd04.201501150900.1421283612463.log";
File dirFile = new File(dirpath);
// Create the directory if it does not exist
if (!dirFile.exists()) {
dirFile.mkdirs();
}
File f = new File(filepath);
/* // Create the file (left disabled; the sink's FileOutputStream creates it)
if (!f.exists()) {
try {
f.createNewFile();
// f.createTempFile("kkk2", ".java", dirFile);
} catch (IOException e) {
e.printStackTrace();
}
}*/
return f;
}
}
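To make the naming scheme above concrete, here is a small throwaway main of my own (not part of the sink) that drives PathManagerExtra with the same patterns used in the Flume configuration further down. Note that getCurrentFile() creates the dated directory on disk:

package com.qunar.qav.flume;

import java.io.File;

// Hypothetical smoke test for the path-building classes above; not used by the sink itself.
public class PathManagerExtraDemo {
    public static void main(String[] args) {
        PathManagerExtra pm = new PathManagerExtra();
        pm.setBaseDirectory("/home/q/performance/shell/data/%Y%m%d%H/");
        pm.setPefix("performance.%Y%m%d%H%M");
        pm.setSuffix(".log");
        // Creates the dated directory and returns a path such as
        // /home/q/performance/shell/data/2018073110/performance.201807311029.1532999340000.log.tmp
        File f = pm.getCurrentFile();
        System.out.println(f.getAbsolutePath());
    }
}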
package com.qunar.qav.flume;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
* Created by lifei on 2018/7/31.
*/
public class RollingFileSinkExtra extends AbstractSink implements
Configurable {
private static final Logger logger = LoggerFactory
.getLogger(RollingFileSinkExtra.class);
private static final long defaultRollInterval = 30;
private static final int defaultBatchSize = 100;
private int batchSize = defaultBatchSize;
private String directory; // RollingFileSink declares this as 'private File directory'; it is a String here because the date pattern still has to be substituted
private long rollInterval;
private OutputStream outputStream;
private ScheduledExecutorService rollService;
private String serializerType;
private Context serializerContext;
private EventSerializer serializer;
private SinkCounter sinkCounter;
private PathManagerExtra pathController;
private volatile boolean shouldRotate;
private String pefix;
private String suffix;
public RollingFileSinkExtra() {
pathController = new PathManagerExtra();
shouldRotate = false;
}
@Override
public void configure(Context context) {
// Read the configuration parameters: sink.directory, sink.rollInterval, sink.filePrefix, sink.fileSuffix
directory = context.getString("sink.directory");
String rollInterval = context.getString("sink.rollInterval");
pefix = context.getString("sink.filePrefix");
suffix = context.getString("sink.fileSuffix");
serializerType = context.getString("sink.serializer", "TEXT");
serializerContext = new Context(context.getSubProperties("sink."
+ EventSerializer.CTX_PREFIX));
Preconditions.checkArgument(directory != null,
"Directory may not be null");
Preconditions.checkNotNull(serializerType,
"Serializer type is undefined");
if (rollInterval == null) {
this.rollInterval = defaultRollInterval;
} else {
this.rollInterval = Long.parseLong(rollInterval);
}
batchSize = context.getInteger("sink.batchSize", defaultBatchSize);
if (sinkCounter == null) {
sinkCounter = new SinkCounter(getName());
}
}
@Override
public void start() {
logger.info("Starting {}...", this);
sinkCounter.start();
super.start();
pathController.setBaseDirectory(directory);
pathController.setPefix(pefix);
pathController.setSuffix(suffix);
if (rollInterval > 0) {
rollService = Executors.newScheduledThreadPool(
1,
new ThreadFactoryBuilder().setNameFormat(
"rollingFileSink-roller-"
+ Thread.currentThread().getId() + "-%d")
.build());
/*
* Every N seconds, mark that it's time to rotate. We purposefully
* do NOT touch anything other than the indicator flag to avoid
* error handling issues (e.g. IO exceptions occurring in two
* different threads. Resist the urge to actually perform rotation
* in a separate thread!
*/
rollService.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
logger.debug("Marking time to rotate file {}",
pathController.getCurrentFile());
shouldRotate = true;
}
}, rollInterval, rollInterval, TimeUnit.SECONDS);
} else {
logger.info("RollInterval is not valid, file rolling will not happen.");
}
logger.info("RollingFileSink {} started.", getName());
}
@Override
public Status process() throws EventDeliveryException {
if (shouldRotate) { // when shouldRotate is true, close the current file and start a new one for subsequent writes
logger.debug("Time to rotate {}", pathController.getCurrentFile());
if (outputStream != null) {
logger.debug("Closing file {}", pathController.getCurrentFile());
try {
serializer.flush();
serializer.beforeClose();
outputStream.close();
sinkCounter.incrementConnectionClosedCount();
shouldRotate = false;
} catch (Exception e) {
sinkCounter.incrementConnectionFailedCount();
throw new EventDeliveryException("Unable to rotate file "
+ pathController.getCurrentFile()
+ " while delivering event", e);
} finally {
serializer = null;
outputStream = null;
}
// Strip the .tmp suffix (appended while the file is being written) now that the file is complete
File ff = pathController.getCurrentFile();
try {
FileUtils.moveFile( ff, new File(ff.getAbsolutePath().substring(0, ff.getAbsolutePath().indexOf(".tmp"))));
} catch (IOException e) {
e.printStackTrace();
}
pathController.rotate();
}
}
if (outputStream == null) {
File currentFile = pathController.getCurrentFile();
logger.debug("Opening output stream for file {}", currentFile);
try {
outputStream = new BufferedOutputStream(new FileOutputStream(
currentFile));
serializer = EventSerializerFactory.getInstance(serializerType,
serializerContext, outputStream);
serializer.afterCreate();
sinkCounter.incrementConnectionCreatedCount();
} catch (IOException e) {
sinkCounter.incrementConnectionFailedCount();
throw new EventDeliveryException("Failed to open file "
+ pathController.getCurrentFile()
+ " while delivering event", e);
}
}
Channel channel = getChannel();
Transaction transaction = channel.getTransaction();
Event event = null;
Status result = Status.READY;
try {
transaction.begin();
int eventAttemptCounter = 0;
for (int i = 0; i < batchSize; i++) {
event = channel.take();
if (event != null) {
sinkCounter.incrementEventDrainAttemptCount();
eventAttemptCounter++;
serializer.write(event);
/*
* FIXME: Feature: Rotate on size and time by checking bytes
* written and setting shouldRotate = true if we're past a
* threshold.
*/
/*
* FIXME: Feature: Control flush interval based on time or
* number of events. For now, we're super-conservative and
* flush on each write.
*/
} else {
// No events found, request back-off semantics from runner
result = Status.BACKOFF;
break;
}
}
serializer.flush();
outputStream.flush();
transaction.commit();
sinkCounter.addToEventDrainSuccessCount(eventAttemptCounter);
} catch (Exception ex) {
transaction.rollback();
throw new EventDeliveryException("Failed to process transaction",
ex);
} finally {
transaction.close();
}
return result;
}
@Override
public void stop() {
logger.info("RollingFile sink {} stopping...", getName());
sinkCounter.stop();
super.stop();
if (outputStream != null) {
logger.debug("Closing file {}", pathController.getCurrentFile());
try {
serializer.flush();
serializer.beforeClose();
outputStream.close();
sinkCounter.incrementConnectionClosedCount();
} catch (IOException e) {
sinkCounter.incrementConnectionFailedCount();
logger.error(
"Unable to close output stream. Exception follows.", e);
} finally {
outputStream = null;
serializer = null;
}
}
if (rollInterval > 0) {
rollService.shutdown();
while (!rollService.isTerminated()) {
try {
rollService.awaitTermination(1, TimeUnit.SECONDS);
} catch (InterruptedException e) {
logger.debug(
"Interrupted while waiting for roll service to stop. "
+ "Please report this.", e);
}
}
}
logger.info("RollingFile sink {} stopped. Event metrics: {}",
getName(), sinkCounter);
}
public String getDirectory() {
return directory;
}
public void setDirectory(String directory) {
this.directory = directory;
}
public long getRollInterval() {
return rollInterval;
}
public void setRollInterval(long rollInterval) {
this.rollInterval = rollInterval;
}
}
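Before the configuration below can reference com.qunar.qav.flume.RollingFileSinkExtra, the packaged jar has to be on Flume's classpath. A sketch of one way to do that (the jar name depends on this project's artifactId/version, and the Flume install path is taken from the commented-out config below):

mvn clean package
cp target/<artifactId>-<version>.jar /home/q/performance/apache-flume-1.7.0-bin/lib/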
Flume configuration
For older Kafka versions, the KafkaSource is configured via zookeeperConnect:
agent.sources = source1
agent.channels = memoryChannel
agent.sinks = k1
#source
agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.zookeeperConnect = xxxxxxxxx.com:2181
agent.sources.source1.topic = custom_wireless_m_pub_loganalysts
agent.sources.source1.groupId = test-group3
agent.sources.source1.batchSize = 1000
agent.sources.source1.batchDurationMillis = 1000
#channel
agent.channels.memoryChannel.type = memory
agent.channels.memoryChannel.capacity = 10000
agent.channels.memoryChannel.transactionCapacity = 10000
#sink
#agent.sinks.k1.type = file_roll
#agent.sinks.k1.channel = c1
#agent.sinks.k1.sink.directory = /home/q/performance/apache-flume-1.7.0-bin/testdir/
agent.sinks.k1.type = com.qunar.qav.flume.RollingFileSinkExtra
agent.sinks.k1.sink.directory = /home/q/performance/shell/data/%Y%m%d%H/
agent.sinks.k1.sink.filePrefix = performance.%Y%m%d%H%M
agent.sinks.k1.sink.fileSuffix = .log
agent.sinks.k1.sink.rollInterval = 60
#assemble
agent.sources.source1.channels = memoryChannel
agent.sinks.k1.channel = memoryChannel
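For newer Kafka versions, Flume 1.7's KafkaSource connects to the brokers directly rather than to ZooKeeper. A minimal sketch of the equivalent source block, assuming the same topic and group (the broker host/port is a placeholder):

agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.kafka.bootstrap.servers = xxxxxxxxx.com:9092
agent.sources.source1.kafka.topics = custom_wireless_m_pub_loganalysts
agent.sources.source1.kafka.consumer.group.id = test-group3
agent.sources.source1.batchSize = 1000
agent.sources.source1.batchDurationMillis = 1000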
Start command
bin/flume-ng agent --conf conf/ --conf-file conf/file.conf --name agent -Dflume.root.logger=INFO,console > run.log 2>&1 &
zookeeper-3.3.6.jar
hadoop-yarn-common-2.1.0-beta.jar
hadoop-yarn-api-2.1.0-beta.jar
hadoop-mapreduce-client-core-2.1.0-beta.jar
hadoop-common-2.2.0.jar
hadoop-auth-2.2.0.jar
hadoop-annotations-2.2.0.jar
commons-configuration-1.6.jar
hadoop-hdfs-2.2.0.jar
udf-1.0.jar
put2hdfs.sh
#!/bin/bash
source /etc/profile
dt="$(date -d "$1 3 min ago " +'%Y-%m-%d')"
h="$(date -d "$1 3 min ago " +'%H')"
hour="$(date -d "$1 3 min ago " +'%Y%m%d%H')"
min="$(date -d "$1 3 min ago " +'%Y%m%d%H%M')"
pt="/home/q/performance/shell/data/${hour}"
chmod a+w ${pt}
cd $pt
pwd
context=`ls $pt | grep "performance.${min}"`
if [ "$context" = "" ];then
echo "performance.${min} does not exist, skip"
exit 1
fi
echo ">>>>>>>>>>>>>>>>process: $pt/$context>>>>>>>>>>>>>>>>>>"
# Get the file size
FILE_SIZE=`ls -l $context | awk '{print $5}' `
echo ">>>>>>>>>>size: $FILE_SIZE<<<<<<<<<<"
# Check the file size; skip the Hive load if the file is empty
if [ $FILE_SIZE -ne 0 ];then
# Compress the file
gzip $context
SQL=" LOAD DATA LOCAL INPATH '${pt}/${context}.gz' INTO TABLE orig_performance_all PARTITION (dt='${dt}',hour='${h}') "
echo "$SQL"
hive -e "use wirelessdata;${SQL};" || exit 1
fi
rm -rf ${context}.gz
echo ">>>>>>>>>>>>>>>>>>done>>>>>>>>>>>>>>>>>"
exit 0
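The script always processes the slice from three minutes ago, so it is meant to run on a schedule. A possible crontab entry, assuming the paths above (the schedule and log location are my assumptions):

# Hypothetical crontab entry: run once a minute; with $1 empty the script uses "now minus 3 minutes"
* * * * * /home/q/performance/shell/put2hdfs.sh >> /home/q/performance/shell/put2hdfs.log 2>&1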