Application scenario: nginx logs from several servers need to be collected in real time onto a single machine. The collected output must be organized into one directory per day and one file per 5-minute interval; for example, a log line from 12:26 on 2012-12-29 should end up in the file corresponding to /data/log/20121229/log-1225-. To do this I implemented my own file sink, modeled on the hdfs-sink of flume-og and flume-ng.
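The %{dayStr}, %{hourStr} and %{minStr} placeholders used in the sink configuration below are resolved from event headers, so whatever produces the events (the sending client, or an interceptor on the agent) has to attach those headers to every event. Below is a minimal sketch of a sender built on the Flume RPC client API, assuming the headers are computed on the client side and the minute is rounded down to a 5-minute boundary; the class name and log line are illustrative only, not part of the original setup.

import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.api.RpcClient;
import org.apache.flume.api.RpcClientFactory;
import org.apache.flume.event.EventBuilder;

// Hypothetical sender: ships one log line to the avro source with the
// dayStr/hourStr/minStr headers that the FileSink path template expects.
public class NginxLogClient {
    public static void main(String[] args) throws EventDeliveryException {
        // Host and port match the avro source in the agent configuration below.
        RpcClient client = RpcClientFactory.getDefaultInstance("192.168.0.100", 44444);
        try {
            Date now = new Date();
            Map<String, String> headers = new HashMap<String, String>();
            headers.put("dayStr", new SimpleDateFormat("yyyyMMdd").format(now));
            headers.put("hourStr", new SimpleDateFormat("HH").format(now));
            // Round the minute down to a 5-minute boundary, e.g. 12:26 -> "25".
            int minute = Integer.parseInt(new SimpleDateFormat("mm").format(now));
            headers.put("minStr", String.format("%02d", minute - minute % 5));

            Event event = EventBuilder.withBody("one nginx access log line",
                    Charset.forName("UTF-8"), headers);
            client.append(event);
        } finally {
            client.close();
        }
    }
}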
The configuration used looks like this:
agent.sources = source
agent.channels = channel
agent.sinks = sink

agent.sources.source.type = avro
agent.sources.source.bind = 192.168.0.100
agent.sources.source.port = 44444
agent.sources.source.channels = channel

agent.sinks.sink.type = org.apache.flume.sink.FileSink
agent.sinks.sink.file.path = /data/log/%{dayStr}
agent.sinks.sink.file.filePrefix = log-%{hourStr}%{minStr}-
agent.sinks.sink.file.txnEventMax = 10000
agent.sinks.sink.file.maxOpenFiles = 5
agent.sinks.sink.channel = channel

agent.channels.channel.type = memory
agent.channels.channel.capacity = 100000
agent.channels.channel.transactionCapacity = 100000
agent.channels.channel.keep-alive = 60
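For the agent to load org.apache.flume.sink.FileSink, the compiled sink classes and the jars listed below have to be on the agent's classpath (for example, copied into Flume's lib directory). The agent itself can then be started the usual way; the config file name here is just an example:

flume-ng agent --conf conf --conf-file file-sink.conf --name agent -Dflume.root.logger=INFO,console

Note that the value passed to --name must match the "agent" prefix used in the properties above.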
The jars it depends on are:
jakarta-oro-2.0.1.jar
flume-ng-core-1.3.0-SNAPSHOT.jar
flume-ng-sdk-1.3.0-SNAPSHOT.jar
flume-ng-configuration-1.3.0-SNAPSHOT.jar
slf4j-log4j12-1.6.1.jar
slf4j-api-1.6.1.jar
guava-10.0.1.jar
The code is as follows:
FileSink.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

public class FileSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(FileSink.class);

    private String path;
    private static final String defaultFileName = "FlumeData";
    private static final int defaultMaxOpenFiles = 50;

    /**
     * Default length of time we wait for blocking BucketWriter calls before
     * timing out the operation. Intended to prevent server hangs.
     */
    private long txnEventMax;

    private FileWriterLinkedHashMap sfWriters;

    private String serializerType;
    private Context serializerContext;

    private boolean needRounding = false;
    private int roundUnit = Calendar.SECOND;
    private int roundValue = 1;
    private SinkCounter sinkCounter;

    private int maxOpenFiles;

    private ScheduledExecutorService timedRollerPool;
    private long rollInterval;

    @Override
    public void configure(Context context) {
        String directory = Preconditions.checkNotNull(
                context.getString("file.path"), "file.path is required");
        String fileName = context.getString("file.filePrefix", defaultFileName);
        this.path = directory + "/" + fileName;

        maxOpenFiles = context.getInteger("file.maxOpenFiles",
                defaultMaxOpenFiles);

        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(
                context.getSubProperties(EventSerializer.CTX_PREFIX));
        txnEventMax = context.getLong("file.txnEventMax", 1l);
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }

        rollInterval = context.getLong("file.rollInterval", 30l);
        String rollerName = "hdfs-" + getName() + "-roll-timer-%d";
        timedRollerPool = Executors.newScheduledThreadPool(maxOpenFiles,
                new ThreadFactoryBuilder().setNameFormat(rollerName).build());
    }

    @Override
    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        List<BucketFileWriter> writers = Lists.newArrayList();
        transaction.begin();
        try {
            Event event = null;
            int txnEventCount = 0;
            for (txnEventCount = 0; txnEventCount < txnEventMax; txnEventCount++) {
                event = channel.take();
                if (event == null) {
                    break;
                }

                // reconstruct the path name by substituting place holders
                String realPath = BucketPath.escapeString(path,
                        event.getHeaders(), needRounding, roundUnit, roundValue);
                BucketFileWriter bucketFileWriter = sfWriters.get(realPath);

                // we haven't seen this file yet, so open it and cache the handle
                if (bucketFileWriter == null) {
                    bucketFileWriter = new BucketFileWriter();
                    bucketFileWriter.open(realPath, serializerType,
                            serializerContext, rollInterval, timedRollerPool,
                            sfWriters);
                    sfWriters.put(realPath, bucketFileWriter);
                }

                // track the buckets getting written in this transaction
                if (!writers.contains(bucketFileWriter)) {
                    writers.add(bucketFileWriter);
                }

                // Write the data to File
                bucketFileWriter.append(event);
            }

            if (txnEventCount == 0) {
                sinkCounter.incrementBatchEmptyCount();
            } else if (txnEventCount == txnEventMax) {
                sinkCounter.incrementBatchCompleteCount();
            } else {
                sinkCounter.incrementBatchUnderflowCount();
            }

            // flush all pending buckets before committing the transaction
            for (BucketFileWriter bucketFileWriter : writers) {
                if (!bucketFileWriter.isBatchComplete()) {
                    flush(bucketFileWriter);
                }
            }
            transaction.commit();
            if (txnEventCount > 0) {
                sinkCounter.addToEventDrainSuccessCount(txnEventCount);
            }

            if (event == null) {
                return Status.BACKOFF;
            }
            return Status.READY;
        } catch (IOException eIO) {
            transaction.rollback();
            logger.warn("File IO error", eIO);
            return Status.BACKOFF;
        } catch (Throwable th) {
            transaction.rollback();
            logger.error("process failed", th);
            if (th instanceof Error) {
                throw (Error) th;
            } else {
                throw new EventDeliveryException(th);
            }
        } finally {
            transaction.close();
        }
    }

    private void flush(BucketFileWriter bucketFileWriter) throws IOException {
        bucketFileWriter.flush();
    }

    @Override
    public synchronized void start() {
        super.start();
        this.sfWriters = new FileWriterLinkedHashMap(maxOpenFiles);
        sinkCounter.start();
    }
}
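process() relies on BucketPath.escapeString to turn the configured path template into a concrete file path from each event's headers. For the 2012-12-29 12:26 example in the scenario above, the resolution is expected to work roughly as in this standalone sketch (the class name and hard-coded header values are illustrative):

import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;

import org.apache.flume.formatter.output.BucketPath;

// Illustration only: resolves the configured path template against headers,
// using the same 5-argument escapeString call that FileSink.process() makes.
public class PathEscapeSketch {
    public static void main(String[] args) {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("dayStr", "20121229");
        headers.put("hourStr", "12");
        headers.put("minStr", "25");
        String realPath = BucketPath.escapeString(
                "/data/log/%{dayStr}/log-%{hourStr}%{minStr}-",
                headers, false, Calendar.SECOND, 1);
        // Expected output: /data/log/20121229/log-1225-
        System.out.println(realPath);
    }
}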
BucketFileWriter.java
package org.apache.flume.sink;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BucketFileWriter {

    private static final Logger logger = LoggerFactory
            .getLogger(BucketFileWriter.class);

    private static final String IN_USE_EXT = ".tmp";

    /**
     * This lock ensures that only one thread can open a file at a time.
     */
    private final AtomicLong fileExtensionCounter;

    private OutputStream outputStream;

    private EventSerializer serializer;

    private String filePath;

    /**
     * Close the file handle and rename the temp file to the permanent filename.
     * Safe to call multiple times. Logs HDFSWriter.close() exceptions.
     *
     * @throws IOException
     *             On failure to rename if temp file exists.
     */
    public BucketFileWriter() {
        fileExtensionCounter = new AtomicLong(System.currentTimeMillis());
    }

    public void open(final String filePath, String serializerType,
            Context serializerContext, final long rollInterval,
            final ScheduledExecutorService timedRollerPool,
            final FileWriterLinkedHashMap sfWriters) throws IOException {
        this.filePath = filePath;
        File file = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        file.getParentFile().mkdirs();
        outputStream = new BufferedOutputStream(new FileOutputStream(file));
        logger.info("filename = " + file.getAbsolutePath());
        serializer = EventSerializerFactory.getInstance(serializerType,
                serializerContext, outputStream);
        serializer.afterCreate();
        if (rollInterval > 0) {
            Callable<Void> action = new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    logger.debug(
                            "Rolling file ({}): Roll scheduled after {} sec elapsed.",
                            filePath + fileExtensionCounter + IN_USE_EXT,
                            rollInterval);
                    if (sfWriters.containsKey(filePath)) {
                        sfWriters.remove(filePath);
                    }
                    close();
                    return null;
                }
            };
            timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
        }
    }

    public void append(Event event) throws IOException {
        serializer.write(event);
    }

    public boolean isBatchComplete() {
        return true;
    }

    public void flush() throws IOException {
        serializer.flush();
        outputStream.flush();
    }

    /**
     * Rename bucketPath file from .tmp to permanent location.
     */
    private void renameBucket() {
        File srcPath = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        File dstPath = new File(filePath + fileExtensionCounter);
        if (srcPath.exists()) {
            srcPath.renameTo(dstPath);
            logger.info("Renaming " + srcPath + " to " + dstPath);
        }
    }

    public synchronized void close() throws IOException, InterruptedException {
        if (outputStream != null) {
            outputStream.flush();
            outputStream.close();
        }
        renameBucket();
    }
}
FileWriterLinkedHashMap.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileWriterLinkedHashMap extends
        LinkedHashMap<String, BucketFileWriter> {

    private static final Logger logger = LoggerFactory
            .getLogger(FileWriterLinkedHashMap.class);

    private static final long serialVersionUID = -7860596835613215998L;
    private final int maxOpenFiles;

    public FileWriterLinkedHashMap(int maxOpenFiles) {
        super(16, 0.75f, true); // stock initial capacity/load factor, access-ordered
        this.maxOpenFiles = maxOpenFiles;
    }

    @Override
    protected boolean removeEldestEntry(Entry<String, BucketFileWriter> eldest) {
        if (size() > maxOpenFiles) {
            // If we have more than maxOpenFiles open files, close the eldest
            // (least recently used) writer and return true so it is evicted.
            try {
                eldest.getValue().close();
            } catch (IOException e) {
                logger.warn(eldest.getKey().toString(), e);
            } catch (InterruptedException e) {
                logger.warn(eldest.getKey().toString(), e);
                Thread.currentThread().interrupt();
            }
            return true;
        } else {
            return false;
        }
    }
}
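FileWriterLinkedHashMap is simply an access-ordered LinkedHashMap whose removeEldestEntry closes and evicts the least recently used writer once more than maxOpenFiles files are open. That mechanism can be seen in isolation in the following sketch, with plain strings standing in for BucketFileWriter instances and a made-up limit of 2:

import java.util.LinkedHashMap;
import java.util.Map;

// Illustration of the access-order LRU eviction the sink relies on;
// strings stand in for BucketFileWriter, and the limit is artificially small.
public class LruSketch {
    public static void main(String[] args) {
        final int maxOpenFiles = 2;
        Map<String, String> writers = new LinkedHashMap<String, String>(16, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
                // In FileWriterLinkedHashMap this is where the eldest writer gets closed.
                return size() > maxOpenFiles;
            }
        };
        writers.put("/data/log/20121229/log-1220-", "writer-1220");
        writers.put("/data/log/20121229/log-1225-", "writer-1225");
        writers.get("/data/log/20121229/log-1220-"); // touch 1220, so 1225 becomes the eldest
        writers.put("/data/log/20121229/log-1230-", "writer-1230");
        // Prints the 1220- and 1230- paths; the 1225- entry was evicted.
        System.out.println(writers.keySet());
    }
}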