背景
需要读取HDFS上变化的日志文件,对每一行进行处理,就是类似于Linux中tail -f实现的功能。
看了看好像Spark和Flink都没有类似的支持,于是就用Flink自定义了Source实现了这个功能。
实现思路
维持一个当前读取位置的偏移量,然后每隔几秒去看下文件的大小是否大于当前偏移量。如果最新文件大小大于当前偏移量就读取数据,并将当前偏移量设置为最新的文件大小;反之,不做任何操作。
以下的代码,还没有把当前读取位置存储到状态中,如果重启会重头开始读。
实现代码
自定义Source
package com.upupfeng.source;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
/**
* 自定义Source实现对HDFS上的文件进行"tail -f"的类似操作
* @author mawf
*/
public class TailHdfsFileSource extends RichSourceFunction<String> {
// 当前读取到的偏移量
private volatile Long currentPos = 0L;
// 运行flag
private volatile Boolean running;
// Flink HDFS FileSystem的配置
private Configuration configuration;
// 要监听的文件的目录
private String path;
// 每次轮询的间隔。秒
private Integer duration;
public TailHdfsFileSource(Configuration configuration, String path, Integer duration) {
this.configuration = configuration;
this.path = path;
this.duration = duration;
init();
}
// 初始化
private void init() {
running = true;
}
@Override
public void run(SourceContext<String> ctx) throws Exception {
FileSystem.initialize(configuration, null);
FileSystem fileSystem = FileSystem.get(FileSystem.getDefaultFsUri());
while (running) {
Long latestLength = getLatestLength(fileSystem);
if (latestLength > currentPos) {
collectRecords(ctx, fileSystem, latestLength);
}
Thread.sleep(duration * 1000);
}
}
// 收集记录
public void collectRecords(SourceContext<String> ctx, FileSystem fs, Long latestLength) throws IOException {
FSDataInputStream dataInputStream = fs.open(new Path(path));
// 移动InputStream的偏移量
dataInputStream.seek(currentPos);
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(dataInputStream));
String line;
while ((line = bufferedReader.readLine()) != null) {
ctx.collect(line);
}
// 更新偏移量
currentPos = latestLength;
}
// 获取最新的文件大小
public Long getLatestLength(FileSystem fs) throws IOException {
FileStatus fileStatus = fs.getFileStatus(new Path(path));
return fileStatus.getLen();
}
@Override
public void cancel() {
running = false;
}
}
使用自定义的Source
package com.upupfeng.demo;
import com.upupfeng.source.TailHdfsFileSource;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Demo wiring {@code TailHdfsFileSource} into a Flink streaming job: tails a
 * log file on HDFS, polling every 5 seconds, and prints each new line.
 *
 * @author mawf
 */
public class TailHdfsFileDemo {

    public static void main(String[] args) throws Exception {
        // Run the HDFS client under this user name.
        System.setProperty("user.name", "root");

        // Point Flink's file-system abstraction at the HDFS namenode.
        Configuration conf = new Configuration();
        conf.setString("fs.default-scheme", "hdfs://hadoop1:8020");

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Tail the file with a single-parallelism source and print every line.
        String logFile = "/user/mwf/a.log";
        env.addSource(new TailHdfsFileSource(conf, logFile, 5))
                .setParallelism(1)
                .print();

        env.execute();
    }
}