TailDirSource继承了AbstractSource类,而AbstractSource类中channelProcessor属性负责将Source中的Event提交给Channel组件
TailDirSource类通过配置参数匹配日志文件,获取日志文件更新内容并且将已经读取的偏移量记录到特定的文件当中(position file)
类图:
TailDirSource 启动的时候, 首先需要进行初始化,然后调用 start 方法.
为了单独测试,我直接用测试用例进行测试. 配置参数与代码如下:
/**
 * Manual smoke test: wires a TaildirSource to a single in-memory channel,
 * configures and starts it, then blocks so the background threads keep running.
 */
@Test
public void testRun() throws InterruptedException {
  source = new TaildirSource();
  channel = new MemoryChannel();

  // Source settings: one file group "f1" tailing a single log file,
  // with one custom header attached to events of that group.
  Map<String, String> parameters = new HashMap<>();
  parameters.put("type","TAILDIR") ;
  parameters.put("channels","c1") ;
  parameters.put("filegroups","f1") ;
  parameters.put("filegroups.f1","/todo/flume/taildir/input/data.log") ;
  parameters.put("headers.f1.headerKey1","markHeaderKey") ;
  parameters.put("fileHeader","true") ;
  parameters.put("positionFile","/todo/flume/taildir/taildir_position.json") ;

  // The channel is configured from an empty context.
  Configurables.configure(channel, new Context());

  // Route every event to the single channel through a replicating selector.
  List<Channel> channels = new ArrayList<>();
  channels.add(channel);
  ChannelSelector rcs = new ReplicatingChannelSelector();
  rcs.setChannels(channels);
  source.setChannelProcessor(new ChannelProcessor(rcs));

  source.configure(new Context(parameters));
  source.start();

  // Keep the test thread alive while the source runs.
  Thread.sleep(1000000);
}
点击运行, 直接进行初始化,根据配置文件进行初始化操作.
org.apache.flume.source.taildir.TaildirSource#configure
/**
 * Reads the source settings from the given context and stores them in fields
 * that {@code start()} later hands to the event reader and schedulers.
 *
 * @param context configuration of this source
 * @throws IllegalStateException if the file-group list is missing or maps to no paths
 * @throws FlumeException if the parent directories of the position file cannot be created
 */
@Override
public synchronized void configure(Context context) {
  // Whitespace-separated list of file group names; mandatory.
  String fileGroups = context.getString(FILE_GROUPS);
  Preconditions.checkState(fileGroups != null, "Missing param: " + FILE_GROUPS);

  // Keep only the sub-properties whose key is one of the listed group names
  // (presumably group name -> file path pattern; selectByKeys is not shown here).
  filePaths = selectByKeys(context.getSubProperties(FILE_GROUPS_PREFIX),
      fileGroups.split("\\s+"));
  Preconditions.checkState(!filePaths.isEmpty(),
      "Mapping for tailing files is empty or invalid: '" + FILE_GROUPS_PREFIX + "'");

  // The position file defaults to DEFAULT_POSITION_FILE under the user's home directory.
  String homePath = System.getProperty("user.home").replace('\\', '/');
  positionFilePath = context.getString(POSITION_FILE, homePath + DEFAULT_POSITION_FILE);
  Path positionFile = Paths.get(positionFilePath);

  // getParent() returns null for a bare file name such as "pos.json"; in that case
  // the file lives in the working directory and there is nothing to create.
  Path positionDir = positionFile.getParent();
  if (positionDir != null) {
    try {
      // Creates the directory together with any missing ancestors.
      Files.createDirectories(positionDir);
    } catch (IOException e) {
      throw new FlumeException("Error creating positionFile parent directories", e);
    }
  }

  // Extra headers per file group, held as a table (row = group, column = header key).
  headerTable = getTable(context, HEADERS_PREFIX);
  // Batch size.
  batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
  // Whether newly seen files are read from their end instead of from offset 0.
  skipToEnd = context.getBoolean(SKIP_TO_END, DEFAULT_SKIP_TO_END);
  // Whether each event carries the byte offset of its line as a header.
  byteOffsetHeader = context.getBoolean(BYTE_OFFSET_HEADER, DEFAULT_BYTE_OFFSET_HEADER);
  // Idle timeout in ms: per the original note, a file not modified for this long
  // gets closed (noted default: 120000).
  idleTimeout = context.getInteger(IDLE_TIMEOUT, DEFAULT_IDLE_TIMEOUT);
  // Interval in ms between writes of the position file (noted default: 3000).
  writePosInterval = context.getInteger(WRITE_POS_INTERVAL, DEFAULT_WRITE_POS_INTERVAL);
  // Whether the file matcher caches its matches.
  cachePatternMatching = context.getBoolean(CACHE_PATTERN_MATCHING,
      DEFAULT_CACHE_PATTERN_MATCHING);
  // Back-off settings used when a poll finds no new data: the increment added to the
  // sleep time and the upper bound on it (noted defaults: 1000 and 5000).
  backoffSleepIncrement = context.getLong(PollableSourceConstants.BACKOFF_SLEEP_INCREMENT,
      PollableSourceConstants.DEFAULT_BACKOFF_SLEEP_INCREMENT);
  maxBackOffSleepInterval = context.getLong(PollableSourceConstants.MAX_BACKOFF_SLEEP,
      PollableSourceConstants.DEFAULT_MAX_BACKOFF_SLEEP);
  // Whether events carry the path of the file they came from, and under which header key.
  fileHeader = context.getBoolean(FILENAME_HEADER, DEFAULT_FILE_HEADER);
  fileHeaderKey = context.getString(FILENAME_HEADER_KEY, DEFAULT_FILENAME_HEADER_KEY);

  // Maximum batch count; a non-positive value is replaced by the default.
  maxBatchCount = context.getLong(MAX_BATCH_COUNT, DEFAULT_MAX_BATCH_COUNT);
  if (maxBatchCount <= 0) {
    maxBatchCount = DEFAULT_MAX_BATCH_COUNT;
    logger.warn("Invalid maxBatchCount specified, initializing source "
        + "default maxBatchCount of {}", maxBatchCount);
  }

  // The counter is created once and survives reconfiguration.
  if (sourceCounter == null) {
    sourceCounter = new SourceCounter(getName());
  }
}
初始化完成之后,调用 start 方法.
/**
 * Starts the source: builds the ReliableTaildirEventReader from the fields filled in by
 * configure(), then schedules two single-threaded background tasks (an idle-file checker
 * and a position-file writer) before marking the source as started.
 *
 * @throws FlumeException if the event reader cannot be instantiated
 */
@Override
public synchronized void start() {
logger.info("{} TaildirSource source starting with directory: {}", getName(), filePaths);
try {
reader = new ReliableTaildirEventReader.Builder()
.filePaths(filePaths)
.headerTable(headerTable)
.positionFilePath(positionFilePath)
.skipToEnd(skipToEnd)
.addByteOffset(byteOffsetHeader)
.cachePatternMatching(cachePatternMatching)
.annotateFileName(fileHeader)
.fileNameHeader(fileHeaderKey)
.build();
} catch (IOException e) {
throw new FlumeException("Error instantiating ReliableTaildirEventReader", e);
}
// Scheduled executor that watches the tailed files for idleness.
idleFileChecker = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setNameFormat("idleFileChecker").build());
// First run after idleTimeout ms (noted default: 120000), then again checkIdleInterval ms
// (noted default: 5000) after each run finishes.
idleFileChecker.scheduleWithFixedDelay(new idleFileCheckerRunnable(),
idleTimeout, checkIdleInterval, TimeUnit.MILLISECONDS);
// Scheduled executor that records how far each file has been read.
// First run after writePosInitDelay ms (noted default: 5000), then every writePosInterval ms;
// the notes in this file disagree on that default (3000 vs 5000) - TODO confirm against
// DEFAULT_WRITE_POS_INTERVAL.
// Per the original note, each entry is written as JSON with the keys "inode", "pos" and "file":
// pos is the read offset and file is the log file's path. The inode is read through a
// Unix-only file attribute, so getInode() would need changing to collect logs on other
// systems such as Windows.
positionWriter = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setNameFormat("positionWriter").build());
positionWriter.scheduleWithFixedDelay(new PositionWriterRunnable(),
writePosInitDelay, writePosInterval, TimeUnit.MILLISECONDS);
super.start();
logger.debug("TaildirSource started");
sourceCounter.start();
}
在这里面构建 对象: ReliableTaildirEventReader
在 ReliableTaildirEventReader 里面 创建有三个地方需要重点关注一下
先说第一个地方, 创建TaildirMatcher , 个人理解是: 为了标识缓存查找文件用的工具类.
org.apache.flume.source.taildir.TaildirMatcher#TaildirMatcher
构造方法如下:
第二个地方,如果创建了新文件或者检测到附加到现有文件,则更新tailFiles映射 TailFiles。
org.apache.flume.source.taildir.ReliableTaildirEventReader#updateTailFiles
/**
 * Update tailFiles mapping if a new file is created or appends are detected
 * to the existing file.
 *
 * @param skipToEnd when true, a file seen for the first time starts at its current length
 *     instead of at offset 0
 * @return the inodes of all files matched in this pass, in iteration order
 * @throws IOException if reading a file's inode fails for a reason other than the file
 *     having been deleted
 */
public List<Long> updateTailFiles(boolean skipToEnd) throws IOException {
updateTime = System.currentTimeMillis();
List<Long> updatedInodes = Lists.newArrayList();
// Walk every cached matcher. Example from the author's test run:
// {filegroup='f1', filePattern='/todo/flume/taildir/input/data.log', cached=true}
for (TaildirMatcher taildir : taildirCache) {
// Headers configured for this file group; example from the author's test run:
// headerTable = {f1={headerKey1=markHeaderKey}}
Map<String, String> headers = headerTable.row(taildir.getFileGroup());
// Files currently matching the group (per the original note they come back sorted by
// last-modified time; getMatchingFiles is not shown here).
for (File f : taildir.getMatchingFiles()) {
long inode;
try {
// The inode lookup is hard-wired to the "unix:ino" attribute.
inode = getInode(f);
} catch (NoSuchFileException e) {
logger.info("File has been deleted in the meantime: " + e.getMessage());
continue;
}
TailFile tf = tailFiles.get(inode);
if (tf == null || !tf.getPath().equals(f.getAbsolutePath())) {
// Not tracked yet, or tracked under a different path: treat it as a new file.
long startPos = skipToEnd ? f.length() : 0;
// Build the TailFile that wraps this file at the chosen start offset.
tf = openFile(f, headers, inode, startPos);
} else {
// Already tracked: it needs tailing if it was modified after the last recorded update
// or if its length differs from the recorded read position.
boolean updated = tf.getLastUpdated() < f.lastModified() || tf.getPos() != f.length();
if (updated) {
// No open file handle on the TailFile: reopen at the recorded position.
if (tf.getRaf() == null) {
tf = openFile(f, headers, inode, tf.getPos());
}
// The file is now shorter than the recorded position: start again from 0.
if (f.length() < tf.getPos()) {
logger.info("Pos " + tf.getPos() + " is larger than file size! "
+ "Restarting from pos 0, file: " + tf.getPath() + ", inode: " + inode);
tf.updatePos(tf.getPath(), inode, 0);
}
}
tf.setNeedTail(updated);
}
// Record the (possibly new) TailFile and report its inode to the caller.
tailFiles.put(inode, tf);
updatedInodes.add(inode);
}
}
return updatedInodes;
}
这里有几个地方需要说一下.
1. headerTable 的数据结构是 HashBasedTable<R, C, V> 其实就是 Map<R, Map<C, V>>
其中 R: rowKey , C : columnKey , V : value
2. 获取匹配到的文件,以最后修改时间进行排序.
3. 根据文件,获取文件对应的 inode , 这个代码里面写死了, 所以只支持 unix:ino
/**
 * Looks up the inode number of a file through the "unix:ino" file attribute.
 *
 * @param file the file to inspect
 * @return the inode number of the file
 * @throws IOException if the attribute cannot be read
 */
private long getInode(File file) throws IOException {
  Object attribute = Files.getAttribute(file.toPath(), "unix:ino");
  return (long) attribute;
}
4. 并根据获取到的 inode 生成对应的实体对象
tf = openFile(f, headers, inode, startPos);
/**
 * Creates a TailFile for a log file from its headers, inode and starting offset.
 *
 * @throws FlumeException wrapping the IOException if the TailFile cannot be created
 */
private TailFile openFile(File file, Map<String, String> headers, long inode, long pos) {
  logger.info("Opening file: " + file + ", inode: " + inode + ", pos: " + pos);
  try {
    TailFile opened = new TailFile(file, headers, inode, pos);
    return opened;
  } catch (IOException e) {
    throw new FlumeException("Failed opening file: " + file, e);
  }
}
在创建 TailFile 的时候, 有几个点要说明一下.
读取文件采用的是 RandomAccessFile 类 , 这个类可以用 seek 方法进行定位, 从而读取指定的数据. 定位是根据字节来的,而不是根据行.
这样就获取到了需要更新的文件了.
我们再跳出来, 回到
org.apache.flume.source.taildir.ReliableTaildirEventReader 的构造方法里面
加载位置文件.
org.apache.flume.source.taildir.ReliableTaildirEventReader#loadPositionFile
这个方法用来更新 TailFile 对象中的文件指针: pos
当第一次加载的时候, 这个文件是空的, 所以会跳过.
我们看一下里面的源码. 其实就是读取文件里面的 json 数据.
对文件进行定位. 涉及的参数: inode / pos / file
文件里面的内容:
[{"inode":12895973088,"pos":27,"file":"/todo/flume/taildir/input/data.log"}]
读取源码:
/**
 * Load a position file which has the last read position of each file.
 * If the position file exists, update tailFiles mapping.
 *
 * <p>The file is expected to hold a JSON array of objects with the keys
 * {@code inode}, {@code pos} and {@code file}. Any other key is skipped. A missing file is
 * logged and ignored; any other I/O failure is logged as an error.
 *
 * @param filePath path of the position file
 * @throws NullPointerException if an entry lacks one of the three expected keys
 */
public void loadPositionFile(String filePath) {
  // try-with-resources closes the JSON reader first and then the file reader, even when
  // parsing fails part-way through.
  try (FileReader fr = new FileReader(filePath);
       JsonReader jr = new JsonReader(fr)) {
    jr.beginArray();
    while (jr.hasNext()) {
      Long inode = null;
      Long pos = null;
      String path = null;
      jr.beginObject();
      while (jr.hasNext()) {
        switch (jr.nextName()) {
          case "inode":
            inode = jr.nextLong();
            break;
          case "pos":
            pos = jr.nextLong();
            break;
          case "file":
            path = jr.nextString();
            break;
          default:
            // Consume the value of an unknown key; otherwise the next nextName() call
            // would fail because the reader is still positioned on that value.
            jr.skipValue();
            break;
        }
      }
      jr.endObject();

      for (Object v : Arrays.asList(inode, pos, path)) {
        Preconditions.checkNotNull(v, "Detected missing value in position file. "
            + "inode: " + inode + ", pos: " + pos + ", path: " + path);
      }
      // Only files that are already tracked can have their position restored.
      TailFile tf = tailFiles.get(inode);
      if (tf != null && tf.updatePos(path, inode, pos)) {
        tailFiles.put(inode, tf);
      } else {
        logger.info("Missing file: " + path + ", inode: " + inode + ", pos: " + pos);
      }
    }
    jr.endArray();
  } catch (FileNotFoundException e) {
    logger.info("File not found: " + filePath + ", not updating position");
  } catch (IOException e) {
    logger.error("Failed loading positionFile: " + filePath, e);
  }
}
ReliableTaildirEventReader 类的初始化和创建就说完了, 在 start 方法里面还有两个线程池:
// Scheduled executor that watches the tailed files for idleness.
idleFileChecker = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setNameFormat("idleFileChecker").build());
// First run after idleTimeout ms (noted default: 120000), then again checkIdleInterval ms
// (noted default: 5000) after each run finishes.
idleFileChecker.scheduleWithFixedDelay(new idleFileCheckerRunnable(),
idleTimeout, checkIdleInterval, TimeUnit.MILLISECONDS);
唠叨两句: 记录文件读取位置的 position 文件是定时更新的. 更新间隔由 writePosInterval 参数控制 (Flume 默认 3000 毫秒, 即 3 秒一次), 而 5000 毫秒是第一次写入前的延迟 writePosInitDelay.
// Scheduled executor that records how far each file has been read.
// First run after writePosInitDelay ms (noted default: 5000), then every writePosInterval ms;
// the notes in this file disagree on that default (3000 vs 5000) - TODO confirm against
// DEFAULT_WRITE_POS_INTERVAL.
// Per the original note, each entry is written as JSON with the keys "inode", "pos" and "file":
// pos is the read offset and file is the log file's path. The inode is read through a
// Unix-only file attribute, so ReliableTaildirEventReader.getInode() would need changing to
// collect logs on other systems such as Windows.
positionWriter = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setNameFormat("positionWriter").build());
positionWriter.scheduleWithFixedDelay(new PositionWriterRunnable(),
writePosInitDelay, writePosInterval, TimeUnit.MILLISECONDS);
好了,到这里, TailDirSource 的启动工作就完成了.
接下来我们说代码执行.
TailDirSource 实现了 PollableSource 接口 , 在这个接口中定义了三个方法,直接看类图吧:
最重要的就三个方法:
getBackOffSleepIncrement: 当最后一次尝试没有找到任何新数据时, 下一次轮询前休眠时间的递增量。 默认值: 1000
getMaxBackOffSleepInterval: 当最后一次尝试没有找到任何新数据时,每次重新尝试轮询新数据之间的最大时间延迟 . 默认值: 5000
process : 运行的代码.
TailDirSource 是封装为 PollingRunner 进行运行的. 不断循环调用 process 方法, 每次执行完 process 方法之后,然后休眠一定时间
话不多说,直接看 process 方法:
org.apache.flume.source.taildir.TaildirSource#process
/**
 * Runs one polling pass: refreshes the set of tailed files, tails every file flagged as
 * needing it, and then closes tail files via closeTailFiles().
 *
 * @return READY if at least one file reported more lines to read; BACKOFF otherwise,
 *     including when the pass failed with any Throwable
 */
@Override
public Status process() {
  Status result = Status.BACKOFF;
  try {
    // Rebuild the list of inodes seen in this pass from the reader.
    existingInodes.clear();
    existingInodes.addAll(reader.updateTailFiles());

    for (long inode : existingInodes) {
      TailFile tailFile = reader.getTailFiles().get(inode);
      if (!tailFile.needTail()) {
        continue;
      }
      // Read and forward the new content of this file.
      if (tailFileProcess(tailFile, true)) {
        result = Status.READY;
      }
    }
    closeTailFiles();
  } catch (Throwable t) {
    logger.error("Unable to tail files", t);
    sourceCounter.incrementEventReadFail();
    result = Status.BACKOFF;
  }
  return result;
}
在这里面,核心的方法是:
boolean hasMoreLines = tailFileProcess(tf, true);
batchSize 代表一个事务 有多少个 Event .
org.apache.flume.source.taildir.ReliableTaildirEventReader#readEvents
/**
 * Reads up to {@code numEvents} events from the current file and adds the configured
 * headers to each of them.
 *
 * <p>If the previous read was never committed, the current file's position is reset through
 * {@code updateFilePos} before reading. A non-empty result marks the reader as uncommitted.
 *
 * @param numEvents maximum number of events to read
 * @param backoffWithoutNL passed through to the file's own readEvents
 * @return the events read; possibly empty
 * @throws IOException if reading the file fails
 * @throws IllegalStateException if the previous read is uncommitted and no current file is set
 */
public List<Event> readEvents(int numEvents, boolean backoffWithoutNL)
    throws IOException {
  if (!committed) {
    if (currentFile == null) {
      // currentFile is null on this path, so the message must not dereference it;
      // doing so raised a NullPointerException instead of this exception.
      throw new IllegalStateException("current file does not exist.");
    }
    logger.info("Last read was never committed - resetting position");
    long lastPos = currentFile.getPos();
    currentFile.updateFilePos(lastPos);
  }

  List<Event> events = currentFile.readEvents(numEvents, backoffWithoutNL, addByteOffset);
  if (events.isEmpty()) {
    return events;
  }

  // Attach the file group's headers and, if enabled, the path of the source file.
  Map<String, String> headers = currentFile.getHeaders();
  boolean hasHeaders = headers != null && !headers.isEmpty();
  if (annotateFileName || hasHeaders) {
    for (Event event : events) {
      if (hasHeaders) {
        event.getHeaders().putAll(headers);
      }
      if (annotateFileName) {
        event.getHeaders().put(fileNameHeader, currentFile.getPath());
      }
    }
  }
  committed = false;
  return events;
}
org.apache.flume.source.taildir.TailFile#readEvents
/**
 * Collects events one line at a time until {@code numEvents} have been read or
 * {@code readEvent} returns null.
 *
 * @param numEvents maximum number of events to return
 * @param backoffWithoutNL passed through to readEvent
 * @param addByteOffset passed through to readEvent
 * @return the events read, in file order; possibly empty
 * @throws IOException if reading the file fails
 */
public List<Event> readEvents(int numEvents, boolean backoffWithoutNL,
    boolean addByteOffset) throws IOException {
  List<Event> batch = Lists.newLinkedList();
  while (batch.size() < numEvents) {
    // Each call reads one line and converts it into an Event.
    Event next = readEvent(backoffWithoutNL, addByteOffset);
    if (next == null) {
      break;
    }
    batch.add(next);
  }
  return batch;
}
org.apache.flume.source.taildir.TailFile#readEvent
/**
 * Reads the next line and wraps it in an Event.
 *
 * @param backoffWithoutNL when true, a line that did not include its line separator is not
 *     emitted; the file position is set back to the value captured before the read
 * @param addByteOffset when true, that captured position is stored in the event headers
 *     under BYTE_OFFSET_HEADER_KEY
 * @return the event, or null if no line was read or the read was backed off
 * @throws IOException if reading the file fails
 */
private Event readEvent(boolean backoffWithoutNL, boolean addByteOffset) throws IOException {
  Long lineStart = getLineReadPos();
  // Per the original note, readLine reads bytes up to the newline (BYTE_NL = 10) and
  // returns the raw bytes of one line.
  LineResult result = readLine();
  if (result == null) {
    return null;
  }

  // Back off from a line that has no separator yet instead of emitting it.
  boolean incompleteLine = !result.lineSepInclude;
  if (backoffWithoutNL && incompleteLine) {
    logger.info("Backing off in file without newline: "
        + path + ", inode: " + inode + ", pos: " + raf.getFilePointer());
    updateFilePos(lineStart);
    return null;
  }

  Event event = EventBuilder.withBody(result.line);
  if (addByteOffset) {
    event.getHeaders().put(BYTE_OFFSET_HEADER_KEY, lineStart.toString());
  }
  return event;
}
在这里说一下 Event 数据结构吧.
Event 的实例是 SimpleEvent 包含 headers 和 body 两部分.
headers, 头信息, 是一个 HashMap 数据结构.
body , 数据内容, 是一个字节数组 (byte[]). 里面为一行数据.
到这里,我们已经拿到了文件数据, 是一个 event 的集合. 需要放到 channel 里面.
getChannelProcessor().processEventBatch(events);
调用的是
org.apache.flume.channel.ChannelProcessor.processEventBatch
嗯嗯,这个方法有点长, 其实就是根据 event 的不同, 将数据分发到不同的 channel 中. 然后等待 sink 进行消费.
/**
 * Runs the events through the interceptor chain, groups them by target channel, and writes
 * each group to its channel inside one transaction per channel.
 *
 * <p>Required channels are written first. A failure on a required channel rolls that
 * transaction back and is rethrown: an {@link Error} or {@link ChannelException} as is,
 * anything else wrapped in a ChannelException. A failure on an optional channel is rolled
 * back and logged; only an Error is rethrown.
 *
 * <p>With several channels configured, some transactions may already be committed when a
 * later one fails and is rolled back.
 *
 * @param events A list of events to put into the configured channels.
 * @throws ChannelException when a write to a required channel fails.
 */
public void processEventBatch(List<Event> events) {
  Preconditions.checkNotNull(events, "Event list must not be null");

  events = interceptorChain.intercept(events);

  // Insertion-ordered maps: channels are written in the order they were first selected.
  Map<Channel, List<Event>> requiredBatches = new LinkedHashMap<>();
  Map<Channel, List<Event>> optionalBatches = new LinkedHashMap<>();

  for (Event event : events) {
    for (Channel target : selector.getRequiredChannels(event)) {
      List<Event> pending = requiredBatches.get(target);
      if (pending == null) {
        pending = new ArrayList<>();
        requiredBatches.put(target, pending);
      }
      pending.add(event);
    }

    for (Channel target : selector.getOptionalChannels(event)) {
      List<Event> pending = optionalBatches.get(target);
      if (pending == null) {
        pending = new ArrayList<>();
        optionalBatches.put(target, pending);
      }
      pending.add(event);
    }
  }

  // Required channels: any failure is propagated to the caller.
  for (Map.Entry<Channel, List<Event>> required : requiredBatches.entrySet()) {
    Channel reqChannel = required.getKey();
    Transaction tx = reqChannel.getTransaction();
    Preconditions.checkNotNull(tx, "Transaction object must not be null");
    try {
      tx.begin();
      for (Event event : required.getValue()) {
        reqChannel.put(event);
      }
      tx.commit();
    } catch (Throwable t) {
      tx.rollback();
      if (t instanceof Error) {
        LOG.error("Error while writing to required channel: " + reqChannel, t);
        throw (Error) t;
      }
      if (t instanceof ChannelException) {
        throw (ChannelException) t;
      }
      throw new ChannelException("Unable to put batch on required " +
          "channel: " + reqChannel, t);
    } finally {
      tx.close();
    }
  }

  // Optional channels: failures are logged, and only an Error is propagated.
  for (Map.Entry<Channel, List<Event>> optional : optionalBatches.entrySet()) {
    Channel optChannel = optional.getKey();
    Transaction tx = optChannel.getTransaction();
    Preconditions.checkNotNull(tx, "Transaction object must not be null");
    try {
      tx.begin();
      for (Event event : optional.getValue()) {
        optChannel.put(event);
      }
      tx.commit();
    } catch (Throwable t) {
      tx.rollback();
      LOG.error("Unable to put batch on optional channel: " + optChannel, t);
      if (t instanceof Error) {
        throw (Error) t;
      }
    } finally {
      tx.close();
    }
  }
}