1. WAL
WAL stands for Write-Ahead Logging, a data-safety mechanism that records every change operation in a log before the change is applied: write the log entry first, then write the data, so the change can be recovered after a failure. WAL is very common in relational databases; MySQL's redo log, for example, is based on WAL.
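To make the pattern concrete, here is a minimal, self-contained Java sketch of write-ahead logging in general (not Flume code; all names are made up for illustration): append the change to the log file, force it to disk, and only then apply it to the real data.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

// Minimal WAL sketch: log first, fsync, then apply. Names are illustrative only.
public class SimpleWal implements AutoCloseable {
  private final FileChannel logChannel;

  public SimpleWal(String path) throws IOException {
    logChannel = FileChannel.open(Paths.get(path),
        StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.APPEND);
  }

  public void append(byte[] record) throws IOException {
    ByteBuffer buf = ByteBuffer.wrap(record);
    while (buf.hasRemaining()) {
      logChannel.write(buf);   // 1. write the change to the log (lands in the OS page cache)
    }
    logChannel.force(false);   // 2. force it to physical storage so it survives a crash
    // 3. only now apply the change to the actual data structure / data file
  }

  @Override
  public void close() throws IOException {
    logChannel.close();
  }
}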
2. Put
// org.apache.flume.channel.file.Log.java
FlumeEventPointer put(long transactionID, Event event)
throws IOException {
Preconditions.checkState(open, "Log is closed");
FlumeEvent flumeEvent = new FlumeEvent(
event.getHeaders(), event.getBody());
// Wrap the operation as a Put. The WAL records four kinds of operations: Put, Take, Commit and Rollback.
// For this Put, the global write-order ID is incremented by 1.
Put put = new Put(transactionID, WriteOrderOracle.next(), flumeEvent);
ByteBuffer buffer = TransactionEventRecord.toByteBuffer(put);
// Pick a data file under the data directories, e.g. log-1
int logFileIndex = nextLogWriter(transactionID);
long usableSpace = logFiles.get(logFileIndex).getUsableSpace();
long requiredSpace = minimumRequiredSpace + buffer.limit();
if (usableSpace <= requiredSpace) {
throw new IOException("Usable space exhausted, only " + usableSpace +
" bytes remaining, required " + requiredSpace + " bytes");
}
boolean error = true;
try {
try {
// Writing the Put record to the WAL data file also persists the Event itself.
// logFileIndex identifies the data file, e.g. the log-1 file.
// Note that this does not mean the data is physically on disk yet: the write only
// reaches the OS buffer, and the OS decides when to flush it to the physical disk.
// To make sure the data is really on disk, call FileChannel's force() method.
FlumeEventPointer ptr = logFiles.get(logFileIndex).put(buffer);
error = false;
return ptr;
} catch (LogFileRetryableIOException e) {
if (!open) {
throw e;
}
roll(logFileIndex, buffer);
FlumeEventPointer ptr = logFiles.get(logFileIndex).put(buffer);
error = false;
return ptr;
}
} finally {
if (error && open) {
roll(logFileIndex);
}
}
}
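The nested try/catch above implements a "write, and if the current data file is full, roll to a new file and retry once" strategy; the same pattern reappears in take, commit and rollback below. Stripped of Flume specifics, a self-contained sketch of that control flow looks like this (writeToCurrentFile and rollToNewFile are placeholders, not Flume methods):

import java.io.IOException;

// Simplified sketch of the roll-and-retry pattern used by Log.put/take/commit/rollback.
public class RollAndRetry {
  static class LogFileRetryableIOException extends IOException {
    LogFileRetryableIOException(String msg) { super(msg); }
  }

  private volatile boolean open = true;

  void append(byte[] buffer) throws IOException {
    boolean error = true;
    try {
      try {
        writeToCurrentFile(buffer);              // first attempt on the current data file
        error = false;
      } catch (LogFileRetryableIOException e) {  // current file is full (or was closed)
        if (!open) {
          throw e;                               // the whole log is closed: give up
        }
        rollToNewFile();                         // switch to a fresh data file
        writeToCurrentFile(buffer);              // retry exactly once on the new file
        error = false;
      }
    } finally {
      if (error && open) {
        rollToNewFile();                         // any other failure: later writes start on a clean file
      }
    }
  }

  private void writeToCurrentFile(byte[] buffer) throws IOException { /* placeholder */ }

  private void rollToNewFile() throws IOException { /* placeholder */ }
}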
//org.apache.flume.channel.file.LogFile.Writer
synchronized FlumeEventPointer put(ByteBuffer buffer) throws IOException {
if (encryptor != null) {
buffer = ByteBuffer.wrap(encryptor.encrypt(buffer.array()));
}
// Write the data to the fileChannel
Pair<Integer, Integer> pair = write(buffer);
return new FlumeEventPointer(pair.getLeft(), pair.getRight());
}
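The returned FlumeEventPointer is essentially the pair (data file ID, offset within that file): enough information to locate the event again later. A purely illustrative way to pack such a pair into a single 64-bit key (not Flume's implementation) would be:

// Illustrative only: pack a (fileID, offset) pair into one long key and back.
public final class FileOffsetKey {
  private FileOffsetKey() {}

  public static long pack(int fileID, int offset) {
    return ((long) fileID << 32) | (offset & 0xFFFFFFFFL);
  }

  public static int fileID(long key) {
    return (int) (key >>> 32);   // high 32 bits
  }

  public static int offset(long key) {
    return (int) key;            // low 32 bits
  }
}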
write(buffer) is a shared method: put, take, commit and rollback are all persisted through it.
private Pair<Integer, Integer> write(ByteBuffer buffer)
throws IOException {
if (!isOpen()) {
throw new LogFileRetryableIOException("File closed " + file);
}
long length = position();
long expectedLength = length + (long) buffer.limit();
// If the record no longer fits in the current log file, a LogFileRetryableIOException is thrown; the caller catches it, rolls to a new log file and rewrites the record. The default log file size is about 1.6 GB.
if (expectedLength > maxFileSize) {
throw new LogFileRetryableIOException(expectedLength + " > " +
maxFileSize);
}
int offset = (int) length;
Preconditions.checkState(offset >= 0, String.valueOf(offset));
// OP_RECORD + size + buffer
int recordLength = 1 + (int) Serialization.SIZE_OF_INT + buffer.limit();
usableSpace.decrement(recordLength);
preallocate(recordLength);
ByteBuffer toWrite = ByteBuffer.allocate(recordLength);
// Write the marker byte that identifies a WAL record (1 byte)
toWrite.put(OP_RECORD);
writeDelimitedBuffer(toWrite, buffer);
toWrite.position(0);
// write() the data to the fileChannel; at this point it is not yet physically persisted to disk
int wrote = getFileChannel().write(toWrite);
Preconditions.checkState(wrote == toWrite.limit());
return Pair.of(getLogFileID(), offset);
}
If the record no longer fits in the current log file, a LogFileRetryableIOException is thrown; the caller catches it, rolls to a new log file and rewrites the record. The default log file size is about 1.6 GB.
protected static void writeDelimitedBuffer(ByteBuffer output, ByteBuffer buffer)
throws IOException {
// Write the limit of the original data buffer as an int (4 bytes)
output.putInt(buffer.limit());
// Write the original data buffer
output.put(buffer);
}
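Putting write() and writeDelimitedBuffer() together, every record in a data file is framed as: a 1-byte OP_RECORD marker, a 4-byte length, then the serialized (and possibly encrypted) record bytes. A minimal sketch of building such a frame; the marker value below is a placeholder, not Flume's actual constant:

import java.nio.ByteBuffer;

// Frame layout: [1-byte op marker][4-byte length][payload].
public class RecordFraming {
  static final byte OP_RECORD = (byte) 0x01;   // illustrative value only

  public static ByteBuffer frame(byte[] payload) {
    ByteBuffer out = ByteBuffer.allocate(1 + 4 + payload.length);
    out.put(OP_RECORD);            // operation marker, 1 byte
    out.putInt(payload.length);    // payload length, 4 bytes (big-endian by default)
    out.put(payload);              // the serialized record itself
    out.flip();                    // ready to be handed to FileChannel.write()
    return out;
  }
}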
3. Take
void take(long transactionID, FlumeEventPointer pointer)
throws IOException {
Preconditions.checkState(open, "Log is closed");
// Wrap the operation as a Take; the global write-order ID is incremented by 1
Take take = new Take(transactionID, WriteOrderOracle.next(),
pointer.getOffset(), pointer.getFileID());
ByteBuffer buffer = TransactionEventRecord.toByteBuffer(take);
int logFileIndex = nextLogWriter(transactionID);
long usableSpace = logFiles.get(logFileIndex).getUsableSpace();
long requiredSpace = minimumRequiredSpace + buffer.limit();
if (usableSpace <= requiredSpace) {
throw new IOException("Usable space exhausted, only " + usableSpace +
" bytes remaining, required " + requiredSpace + " bytes");
}
boolean error = true;
try {
try {
// Write the data to the fileChannel via the shared write() method; from here on it works the same as put
logFiles.get(logFileIndex).take(buffer);
error = false;
} catch (LogFileRetryableIOException e) {
if (!open) {
throw e;
}
roll(logFileIndex, buffer);
logFiles.get(logFileIndex).take(buffer);
error = false;
}
} finally {
if (error && open) {
roll(logFileIndex);
}
}
}
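A Take does not write the event body into the WAL again; it only records which Put it consumed, via the pointer's file ID and offset, together with the transaction ID and write-order ID. Conceptually the record carries these fields (field names illustrative; the real serialization is done by TransactionEventRecord):

// Conceptual shape of a Take record, for illustration only.
class TakeRecordSketch {
  long transactionID;   // transaction this take belongs to
  long writeOrderID;    // global ordering across all WAL operations
  int fileID;           // data file that holds the original Put
  int offset;           // offset of that Put inside the file
}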
4. Commit
//org.apache.flume.channel.file.Log
// Takes the transaction ID plus a type that distinguishes a take commit from a put commit
private void commit(long transactionID, short type) throws IOException {
Preconditions.checkState(open, "Log is closed");
// Wrap the operation as a Commit; the global write-order ID is incremented by 1
Commit commit = new Commit(transactionID, WriteOrderOracle.next(), type);
ByteBuffer buffer = TransactionEventRecord.toByteBuffer(commit);
int logFileIndex = nextLogWriter(transactionID);
long usableSpace = logFiles.get(logFileIndex).getUsableSpace();
long requiredSpace = minimumRequiredSpace + buffer.limit();
if (usableSpace <= requiredSpace) {
throw new IOException("Usable space exhausted, only " + usableSpace +
" bytes remaining, required " + requiredSpace + " bytes");
}
boolean error = true;
try {
try {
LogFile.Writer logFileWriter = logFiles.get(logFileIndex);
// If multiple transactions are committing at the same time,
// this ensures that the number of actual fsyncs is small and a
// number of them are grouped together into one.
// Write the data to the fileChannel
logFileWriter.commit(buffer);
// Every transaction commit forces the data to disk
logFileWriter.sync();
error = false;
} catch (LogFileRetryableIOException e) {
if (!open) {
throw e;
}
roll(logFileIndex, buffer);
LogFile.Writer logFileWriter = logFiles.get(logFileIndex);
logFileWriter.commit(buffer);
logFileWriter.sync();
error = false;
}
} finally {
if (error && open) {
roll(logFileIndex);
}
}
}
synchronized void commit(ByteBuffer buffer) throws IOException {
if (encryptor != null) {
buffer = ByteBuffer.wrap(encryptor.encrypt(buffer.array()));
}
// Shared write() method
write(buffer);
dirty = true;
lastCommitPosition = position();
}
//org.apache.flume.channel.file.LogFile.Writer
synchronized void sync() throws IOException {
if (!fsyncPerTransaction && !dirty) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"No events written to file, " + getFile().toString() +
" in last " + fsyncInterval + " or since last commit.");
}
return;
}
if (!isOpen()) {
throw new LogFileRetryableIOException("File closed " + file);
}
if (lastSyncPosition < lastCommitPosition) {
// Force the data to the physical disk; force(false) flushes file content but not file metadata
getFileChannel().force(false);
lastSyncPosition = position();
syncCount++;
dirty = false;
}
}
Note that besides the fsync() performed on each transaction commit, a periodic fsync() can be configured instead:
//org.apache.flume.channel.file.LogFile.Writer
Writer(File file, int logFileID, long maxFileSize,
CipherProvider.Encryptor encryptor, long usableSpaceRefreshInterval,
boolean fsyncPerTransaction, int fsyncInterval) throws IOException {
this.file = file;
this.logFileID = logFileID;
this.maxFileSize = Math.min(maxFileSize,
FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE);
this.encryptor = encryptor;
writeFileHandle = new RandomAccessFile(file, "rw");
writeFileChannel = writeFileHandle.getChannel();
this.fsyncPerTransaction = fsyncPerTransaction;
this.fsyncInterval = fsyncInterval;
// Whether every transaction is fsync()'d; this is also a user-configurable option
if (!fsyncPerTransaction) {
LOG.info("Sync interval = " + fsyncInterval);
syncExecutor = Executors.newSingleThreadScheduledExecutor();
syncExecutor.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
try {
// Periodic fsync(); the default interval is 5 seconds
sync();
} catch (Throwable ex) {
LOG.error("Data file, " + getFile().toString() + " could not " +
"be synced to disk due to an error.", ex);
}
}
}, fsyncInterval, fsyncInterval, TimeUnit.SECONDS);
} else {
syncExecutor = null;
}
usableSpace = new CachedFSUsableSpace(file, usableSpaceRefreshInterval);
LOG.info("Opened " + file);
open = true;
}
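As far as I know, both knobs are exposed as File Channel properties in the agent configuration: fsyncPerTransaction (default true) and fsyncInterval (in seconds, default 5). Treating that as an assumption, a configuration that trades a little durability for throughput would look roughly like this:

# Hypothetical agent snippet: relax per-transaction fsync on the file channel
agent.channels = c1
agent.channels.c1.type = file
agent.channels.c1.checkpointDir = /data/flume/checkpoint
agent.channels.c1.dataDirs = /data/flume/data
agent.channels.c1.fsyncPerTransaction = false
agent.channels.c1.fsyncInterval = 10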
5. Rollback
void rollback(long transactionID) throws IOException {
Preconditions.checkState(open, "Log is closed");
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Rolling back " + transactionID);
}
// Wrap the operation as a Rollback; the global write-order ID is incremented by 1
Rollback rollback = new Rollback(transactionID, WriteOrderOracle.next());
ByteBuffer buffer = TransactionEventRecord.toByteBuffer(rollback);
int logFileIndex = nextLogWriter(transactionID);
long usableSpace = logFiles.get(logFileIndex).getUsableSpace();
long requiredSpace = minimumRequiredSpace + buffer.limit();
if (usableSpace <= requiredSpace) {
throw new IOException("Usable space exhausted, only " + usableSpace +
" bytes remaining, required " + requiredSpace + " bytes");
}
boolean error = true;
try {
try {
// Same pattern as above
logFiles.get(logFileIndex).rollback(buffer);
error = false;
} catch (LogFileRetryableIOException e) {
if (!open) {
throw e;
}
roll(logFileIndex, buffer);
logFiles.get(logFileIndex).rollback(buffer);
error = false;
}
} finally {
if (error && open) {
roll(logFileIndex);
}
}
}
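Put, Take, Commit and Rollback map directly onto the channel transaction API that sources and sinks use. A simplified source-side example (standard Flume client API; the event body is arbitrary): a put inside a committed transaction ends up as a Put record followed by a Commit record in the WAL, while a failure produces a Rollback record instead.

import java.nio.charset.StandardCharsets;
import org.apache.flume.Channel;
import org.apache.flume.Event;
import org.apache.flume.Transaction;
import org.apache.flume.event.EventBuilder;

// Sketch of how a source-side put drives the WAL: Put records while the
// transaction is open, then a Commit record (plus sync()) on tx.commit().
public class ChannelPutExample {
  public static void putOne(Channel channel) {
    Transaction tx = channel.getTransaction();
    tx.begin();
    try {
      Event event = EventBuilder.withBody("hello".getBytes(StandardCharsets.UTF_8));
      channel.put(event);   // -> Put record appended to the data file
      tx.commit();          // -> Commit record appended, then forced to disk
    } catch (Throwable t) {
      tx.rollback();        // -> Rollback record appended
      throw t;
    } finally {
      tx.close();
    }
  }
}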
6. Reading the WAL
public LogRecord next() throws IOException, CorruptEventException {
int offset = -1;
try {
// During replay, the log-x.meta file gives the read/write position of log-x at the last checkpoint;
// the channel has already seeked to that position, so log-x is read starting from the checkpoint
long position = fileChannel.position();
if (position > FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE) {
LOG.info("File position exceeds the threshold: "
+ FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE
+ ", position: " + position);
}
offset = (int) position;
Preconditions.checkState(offset >= 0);
while (offset < fileHandle.length()) {
// Read one byte to get the operation marker
byte operation = fileHandle.readByte();
// OP_RECORD means a WAL record follows
if (operation == OP_RECORD) {
break;
} else if (operation == OP_EOF) {
LOG.info("Encountered EOF at " + offset + " in " + file);
return null;
} else if (operation == OP_NOOP) {
LOG.info("No op event found in file: " + file.toString() +
" at " + offset + ". Skipping event.");
skipRecord(fileHandle, offset + 1);
offset = (int) fileHandle.getFilePointer();
continue;
} else {
LOG.error("Encountered non op-record at " + offset + " " +
Integer.toHexString(operation) + " in " + file);
return null;
}
}
if (offset >= fileHandle.length()) {
return null;
}
// The marker is OP_RECORD, so continue reading the record data
return doNext(offset);
} catch (EOFException e) {
return null;
} catch (IOException e) {
throw new IOException("Unable to read next Transaction from log file " +
file.getCanonicalPath() + " at offset " + offset, e);
}
}
LogRecord doNext(int offset) throws IOException, CorruptEventException,
DecryptionFailureException {
byte[] buffer = null;
TransactionEventRecord event = null;
try {
// Read the record payload into the buffer
buffer = readDelimitedBuffer(getFileHandle());
if (decryptor != null) {
buffer = decryptor.decrypt(buffer);
}
event = TransactionEventRecord.fromByteArray(buffer);
} catch (CorruptEventException ex) {
LOGGER.warn("Corrupt file found. File id: log-" + this.getLogFileID(),
ex);
// Return null so that replay handler thinks all events in this file
// have been taken.
if (!fsyncPerTransaction) {
return null;
}
throw ex;
} catch (DecryptionFailureException ex) {
if (!fsyncPerTransaction) {
LOGGER.warn("Could not decrypt even read from channel. Skipping " +
"event.", ex);
return null;
}
throw ex;
}
return new LogRecord(getLogFileID(), offset, event);
}
protected static byte[] readDelimitedBuffer(RandomAccessFile fileHandle)
throws IOException, CorruptEventException {
// Read an int (4 bytes): the size of the data buffer
int length = fileHandle.readInt();
if (length < 0) {
throw new CorruptEventException("Length of event is: " + String.valueOf(length) +
". Event must have length >= 0. Possible corruption of data or partial fsync.");
}
byte[] buffer = new byte[length];
try {
// Read the data into the buffer
fileHandle.readFully(buffer);
} catch (EOFException ex) {
throw new CorruptEventException("Remaining data in file less than " +
"expected size of event.", ex);
}
return buffer;
}
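On the read side the framing is symmetric: one marker byte, a 4-byte length, then that many payload bytes. A small standalone sketch of decoding one frame with DataInputStream (again with a placeholder marker value, not Flume's constant):

import java.io.DataInputStream;
import java.io.IOException;

// Decode one frame: [1-byte op marker][4-byte length][payload].
public class RecordParsing {
  static final byte OP_RECORD = (byte) 0x01;   // placeholder value only

  public static byte[] readOneRecord(DataInputStream in) throws IOException {
    byte op = in.readByte();       // operation marker
    if (op != OP_RECORD) {
      return null;                 // EOF / noop / corruption would be handled elsewhere
    }
    int length = in.readInt();     // payload size, 4 bytes
    if (length < 0) {
      throw new IOException("Negative record length: " + length);
    }
    byte[] payload = new byte[length];
    in.readFully(payload);         // throws EOFException if the record is truncated
    return payload;
  }
}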
7. Discussion
It is worth pointing out that if a source has put a batch of events into the channel but the transaction has not been committed yet, and the machine loses power or the process crashes before the data has been fsync'd to disk, those events can be lost.