flink在1.8版本,我们使用了
package org.apache.flink.streaming.connectors.fs.bucketing;
遇到了坑:
flink在写文件的时候,正在写的文件是:
.in-progress结尾的;
如果写完了,会有一个
.pending的状态
,会等到 checkpoint 完成之后,才会把 _ 开头的临时文件正式重命名为非 _ 开头的正式数据文件。
hive在读数据的时候,不会读取_开头的数据!
所以非常偶尔会出现数据缺失的 bug:比如我们的 merge 任务已经开始执行了,但此时恰好还有文件处于 .pending 状态,所以 MapReduce 读到的文件数量会少一些。
最终解决:
判断 HDFS 文件是否写完时,必须确认文件不处于 .in-progress 和 .pending 状态,才能认为写入完成。
下面分析一下flink源码:
注意这几个常量:
先看 initializeState 方法(算子初始化 / 状态恢复的入口):
/**
 * Called by Flink when this sink operator is (re-)initialized. Sets up the Hadoop
 * FileSystem and, if the job is restoring from a checkpoint, replays the saved
 * bucket states so that leftover .pending / .in-progress files are finalized.
 * (Decompiled from Flink 1.8 BucketingSink.)
 */
public void initializeState(FunctionInitializationContext context) throws Exception {
    // Guard: this operator must not be initialized twice.
    Preconditions.checkArgument(this.restoredBucketStates == null, "The operator has already been initialized.");
    try {
        // Lazily create the FileSystem for basePath.
        this.initFileSystem();
    } catch (IOException var6) {
        LOG.error("Error while creating FileSystem when initializing the state of the BucketingSink.", var6);
        throw new RuntimeException("Error while creating FileSystem when initializing the state of the BucketingSink.", var6);
    }
    // Look up FileSystem.truncate(...) reflectively; refTruncate stays null when the
    // Hadoop version does not support truncate (handled later via valid-length files).
    if (this.refTruncate == null) {
        this.refTruncate = this.reflectTruncate(this.fs);
    }
    OperatorStateStore stateStore = context.getOperatorStateStore();
    // Operator (non-keyed) list state holding one State<T> entry per former subtask.
    this.restoredBucketStates = stateStore.getSerializableListState("bucket-states");
    int subtaskIndex = this.getRuntimeContext().getIndexOfThisSubtask();
    if (context.isRestored()) {
        LOG.info("Restoring state for the {} (taskIdx={}).", this.getClass().getSimpleName(), subtaskIndex);
        Iterator var4 = ((Iterable)this.restoredBucketStates.get()).iterator();
        while(var4.hasNext()) {
            BucketingSink.State<T> recoveredState = (BucketingSink.State)var4.next();
            // The key step: renames leftover .pending/.in-progress files to their
            // final names and truncates the in-progress file to its valid length.
            this.handleRestoredBucketState(recoveredState);
            if (LOG.isDebugEnabled()) {
                LOG.debug("{} idx {} restored {}", new Object[]{this.getClass().getSimpleName(), subtaskIndex, recoveredState});
            }
        }
    } else {
        LOG.info("No state to restore for the {} (taskIdx={}).", this.getClass().getSimpleName(), subtaskIndex);
    }
}
/**
 * Lazily creates the Hadoop FileSystem backing {@code basePath}.
 * No-op when the FileSystem has already been created.
 */
private void initFileSystem() throws IOException {
    if (this.fs != null) {
        return;
    }
    this.fs = createHadoopFileSystem(new Path(this.basePath), this.fsConfig);
}
/**
 * Replays one restored checkpoint state: for every bucket, finalizes the file that
 * was being written when the checkpoint was taken and moves the files that were
 * pending at checkpoint time to their final locations.
 */
private void handleRestoredBucketState(BucketingSink.State<T> restoredState) {
    Preconditions.checkNotNull(restoredState);
    for (BucketingSink.BucketState<T> bucket : restoredState.bucketStates.values()) {
        // Files that became pending AFTER the checkpoint are discarded here;
        // their data will be re-written during normal processing after restore.
        bucket.pendingFiles.clear();
        // Rename the checkpointed in-progress/pending part file to its final name
        // and truncate it back to the length that was valid at checkpoint time.
        this.handlePendingInProgressFile(bucket.currentFile, bucket.currentFileValidLength);
        // Reset the per-bucket write cursor; a new part file will be opened on demand.
        bucket.currentFile = null;
        bucket.currentFileValidLength = -1L;
        bucket.isWriterOpen = false;
        // Finalize files that were already pending when earlier checkpoints completed.
        this.handlePendingFilesForPreviousCheckpoints(bucket.pendingFilesPerCheckpoint);
        bucket.pendingFilesPerCheckpoint.clear();
    }
}
/**
 * Finalizes the part file that was in flight when the restored checkpoint was taken.
 * First renames the file from its .pending or .in-progress name to its final name,
 * then trims it down to {@code validLength} — either by a real HDFS truncate (when
 * available via reflection) or by writing a companion ".valid-length" file that
 * downstream readers must honor. (Decompiled from Flink 1.8 BucketingSink.)
 *
 * @param file        final (un-suffixed) path of the part file, or null if no file
 *                    was in progress at checkpoint time
 * @param validLength number of bytes of the file that were covered by the checkpoint
 */
private void handlePendingInProgressFile(String file, long validLength) {
    if (file != null) {
        Path partPath = new Path(file);
        try {
            Path partPendingPath = this.getPendingPathFor(partPath);
            Path partInProgressPath = this.getInProgressPathFor(partPath);
            // The file may be in any of three places depending on how far the
            // pre-failure run got; rename from whichever suffixed name still exists.
            if (this.fs.exists(partPendingPath)) {
                LOG.debug("In-progress file {} has been moved to pending after checkpoint, moving to final location.", partPath);
                this.fs.rename(partPendingPath, partPath);
            } else if (this.fs.exists(partInProgressPath)) {
                LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                this.fs.rename(partInProgressPath, partPath);
            } else if (this.fs.exists(partPath)) {
                // Already at its final name — e.g. a previous restore attempt did the rename.
                LOG.debug("In-Progress file {} was already moved to final location {}.", file, partPath);
            } else {
                LOG.debug("In-Progress file {} was neither moved to pending nor is still in progress. Possibly, it was moved to final location by a previous snapshot restore", file);
            }
            // refTruncate may not have been resolved yet (reflection lookup is lazy).
            if (this.refTruncate == null) {
                this.refTruncate = this.reflectTruncate(this.fs);
            }
            if (this.refTruncate != null) {
                // Path A: the FileSystem supports truncate — cut the file to validLength.
                LOG.debug("Truncating {} to valid length {}", partPath, validLength);
                if (this.fs instanceof DistributedFileSystem) {
                    // HDFS keeps the dead writer's lease; recover it and wait until the
                    // file is closed, or give up after asyncTimeout milliseconds.
                    DistributedFileSystem dfs = (DistributedFileSystem)this.fs;
                    LOG.debug("Trying to recover file lease {}", partPath);
                    dfs.recoverLease(partPath);
                    boolean isclosed = dfs.isFileClosed(partPath);
                    StopWatch sw = new StopWatch();
                    sw.start();
                    // Poll every 500 ms until closed or the timeout budget is spent.
                    for(; !isclosed && sw.getTime() <= this.asyncTimeout; isclosed = dfs.isFileClosed(partPath)) {
                        try {
                            Thread.sleep(500L);
                        } catch (InterruptedException var25) {
                            // NOTE(review): interrupt is swallowed here in the upstream
                            // decompiled code; the interrupt flag is not restored.
                        }
                    }
                }
                // truncate() returns false when it completes asynchronously.
                Boolean truncated = (Boolean)this.refTruncate.invoke(this.fs, partPath, validLength);
                if (!truncated) {
                    LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);
                    StopWatch sw = new StopWatch();
                    sw.start();
                    long newLen;
                    // Poll the file length until it reaches validLength or we time out.
                    for(newLen = this.fs.getFileStatus(partPath).getLen(); newLen != validLength && sw.getTime() <= this.asyncTimeout; newLen = this.fs.getFileStatus(partPath).getLen()) {
                        try {
                            Thread.sleep(500L);
                        } catch (InterruptedException var24) {
                            // NOTE(review): interrupt swallowed (upstream behavior).
                        }
                    }
                    if (newLen != validLength) {
                        throw new RuntimeException("Truncate did not truncate to right length. Should be " + validLength + " is " + newLen + ".");
                    }
                }
            } else {
                // Path B: no truncate support — record validLength in a sidecar
                // ".valid-length" file; readers must ignore bytes past that offset.
                Path validLengthFilePath = this.getValidLengthPathFor(partPath);
                if (!this.fs.exists(validLengthFilePath) && this.fs.exists(partPath)) {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath, validLength);
                    FSDataOutputStream lengthFileOut = this.fs.create(validLengthFilePath);
                    // Decompiler's expansion of try-with-resources: var34 tracks the
                    // primary exception so close() failures are added as suppressed.
                    Throwable var34 = null;
                    try {
                        lengthFileOut.writeUTF(Long.toString(validLength));
                    } catch (Throwable var23) {
                        var34 = var23;
                        throw var23;
                    } finally {
                        if (lengthFileOut != null) {
                            if (var34 != null) {
                                try {
                                    lengthFileOut.close();
                                } catch (Throwable var22) {
                                    var34.addSuppressed(var22);
                                }
                            } else {
                                lengthFileOut.close();
                            }
                        }
                    }
                }
            }
        } catch (IOException var27) {
            LOG.error("Error while restoring BucketingSink state.", var27);
            throw new RuntimeException("Error while restoring BucketingSink state.", var27);
        } catch (IllegalAccessException | InvocationTargetException var28) {
            LOG.error("Could not invoke truncate.", var28);
            throw new RuntimeException("Could not invoke truncate.", var28);
        }
    }
}
从反编译的代码中,基本上还是可以看清楚 BucketingSink 状态恢复的整个过程。