1.刷盘管理器之DefaultFlushManager
public class CommitLog implements Swappable {

    /**
     * Flush manager that picks a flush service from the configured
     * {@code FlushDiskType} and dispatches every appended message to it.
     */
    class DefaultFlushManager implements FlushManager {

        /**
         * Selects the flush service from the configured flush disk type:
         * {@code GroupCommitService} for {@code SYNC_FLUSH}, otherwise
         * {@code FlushRealTimeService}. The {@code CommitRealTimeService} is always
         * created here but is only started by {@link #start()} when the transient
         * store pool is enabled.
         */
        public DefaultFlushManager() {
            final FlushDiskType configuredType =
                CommitLog.this.defaultMessageStore.getMessageStoreConfig().getFlushDiskType();
            if (FlushDiskType.SYNC_FLUSH != configuredType) {
                // Asynchronous flush strategy.
                this.flushCommitLogService = new CommitLog.FlushRealTimeService();
            } else {
                // Synchronous flush strategy.
                this.flushCommitLogService = new CommitLog.GroupCommitService();
            }
            // Commit service used by the transient-store-pool path.
            this.commitRealTimeService = new CommitLog.CommitRealTimeService();
        }

        /**
         * Starts the selected flush service and, when the transient store pool is
         * enabled, the commit service as well.
         */
        @Override
        public void start() {
            this.flushCommitLogService.start();
            if (!defaultMessageStore.isTransientStorePoolEnable()) {
                return;
            }
            this.commitRealTimeService.start();
        }

        /**
         * Triggers the flush strategy for a message that has just been appended.
         *
         * @param result     append result; its wrote offset plus wrote bytes gives the
         *                   offset the flush must reach
         * @param messageExt the appended message; {@code isWaitStoreMsgOK()} decides
         *                   whether the caller waits for the flush in sync mode
         * @return the pending request's future when a synchronous flush is awaited,
         *         otherwise an already completed future holding {@code PUT_OK}
         */
        public CompletableFuture<PutMessageStatus> handleDiskFlush(AppendMessageResult result, MessageExt messageExt) {
            final MessageStoreConfig storeConfig = CommitLog.this.defaultMessageStore.getMessageStoreConfig();
            final FlushDiskType configuredType = storeConfig.getFlushDiskType();

            if (FlushDiskType.SYNC_FLUSH != configuredType) {
                // Asynchronous flush: just wake the responsible background service.
                if (CommitLog.this.defaultMessageStore.isTransientStorePoolEnable()) {
                    commitRealTimeService.wakeup();
                } else {
                    flushCommitLogService.wakeup();
                }
                return CompletableFuture.completedFuture(PutMessageStatus.PUT_OK);
            }

            // Synchronous flush.
            final GroupCommitService groupCommit = (GroupCommitService) this.flushCommitLogService;
            if (!messageExt.isWaitStoreMsgOK()) {
                // The message does not wait for the flush: wake the service and return at once.
                groupCommit.wakeup();
                return CompletableFuture.completedFuture(PutMessageStatus.PUT_OK);
            }

            final long flushUpTo = result.getWroteOffset() + result.getWroteBytes();
            final GroupCommitRequest pending =
                new GroupCommitRequest(flushUpTo, storeConfig.getSyncFlushTimeout());
            // Register with the watcher first, then hand the request to the flush service.
            flushDiskWatcher.add(pending);
            groupCommit.putRequest(pending);
            return pending.future();
        }
    }
}
2.同步刷盘策略
同步刷盘服务为GroupCommitService。创建GroupCommitService对象时,将会初始化两个内部集合,分别是requestsWrite和requestsRead。requestsWrite
用于存放putRequest方法写入的刷盘请求,requestsRead用于存放doCommit方法读取的刷盘请求。使用两个队列实现读写分离,可以减少putRequest提交刷盘请求与doCommit消费刷盘请求之间的锁竞争。
另外,还会初始化一个独占锁,用于保证存入请求和交换请求操作的线程安全。
2.1.FlushDiskWatcher
/**
 * Background service that watches pending {@link GroupCommitRequest}s and completes
 * any request whose deadline passes before its future is done with
 * {@code FLUSH_DISK_TIMEOUT}.
 */
public class FlushDiskWatcher extends ServiceThread {
    private static final Logger log = LoggerFactory.getLogger(LoggerName.STORE_LOGGER_NAME);
    private final LinkedBlockingQueue<GroupCommitRequest> commitRequests = new LinkedBlockingQueue<>();

    @Override
    public String getServiceName() {
        return FlushDiskWatcher.class.getSimpleName();
    }

    /**
     * Takes requests one at a time and polls each until its future is done or its
     * deadline has passed. Interruption is handled locally because {@code run()}
     * cannot propagate the checked {@link InterruptedException}; the loop condition
     * {@code isStopped()} remains the only exit from the service loop.
     */
    @Override
    public void run() {
        while (!isStopped()) {
            GroupCommitRequest request;
            try {
                request = commitRequests.take();
            } catch (InterruptedException e) {
                // Not re-interrupting on purpose: with the flag set, take() would throw
                // again immediately and spin until isStopped() becomes true.
                log.warn("take flush disk commit request, but interrupted, this may caused by shutdown");
                continue;
            }
            while (!request.future().isDone()) {
                long now = System.nanoTime();
                if (now - request.getDeadLine() >= 0) {
                    // Deadline reached or passed: report the flush as timed out.
                    request.wakeupCustomer(PutMessageStatus.FLUSH_DISK_TIMEOUT);
                    break;
                }
                // To avoid frequent thread switching, replace future.get with sleep here.
                long sleepTime = (request.getDeadLine() - now) / 1_000_000;
                sleepTime = Math.min(10, sleepTime);
                if (sleepTime == 0) {
                    // Less than one millisecond remains: treat it as timed out.
                    request.wakeupCustomer(PutMessageStatus.FLUSH_DISK_TIMEOUT);
                    break;
                }
                try {
                    Thread.sleep(sleepTime);
                } catch (InterruptedException e) {
                    log.warn("An exception occurred while waiting for flushing disk to complete. this may caused by shutdown");
                    break;
                }
            }
        }
    }

    /**
     * Registers a request to be watched for timeout.
     *
     * @param request the request to watch
     */
    public void add(GroupCommitRequest request) {
        commitRequests.add(request);
    }

    /**
     * @return the number of requests currently queued for watching
     */
    public int queueSize() {
        return commitRequests.size();
    }
}
2.2.同步刷盘策略
/**
 * Flush service that processes {@code GroupCommitRequest}s in batches.
 * Producers add requests to {@code requestsWrite}; the service thread swaps the two
 * lists and then drains {@code requestsRead} in {@code doCommit()}.
 */
class GroupCommitService extends FlushCommitLogService {
// Requests added by putRequest(); exchanged with requestsRead in swapRequests().
private volatile LinkedList<GroupCommitRequest> requestsWrite = new LinkedList<>();
// Requests consumed by doCommit().
private volatile LinkedList<GroupCommitRequest> requestsRead = new LinkedList<>();
// Guards additions to requestsWrite and the swap of the two lists.
private final PutMessageSpinLock lock = new PutMessageSpinLock();
/**
 * Adds a flush request to the write list under the lock, then wakes the service thread.
 *
 * @param request the flush request to enqueue
 */
public void putRequest(final GroupCommitRequest request) {
lock.lock();// spin lock (per the lock's type name)
try {
this.requestsWrite.add(request);
} finally {
lock.unlock();
}
this.wakeup();
}
/**
 * Exchanges the read and write lists. The swap and putRequest() are mutually
 * exclusive because both hold the same lock.
 * 1. doCommit() replaces requestsRead with a fresh list after draining it, so the
 *    list that becomes requestsWrite here is empty as long as doCommit() has run
 *    since the previous swap.
 * 2. While the service thread is flushing, putRequest() keeps adding to requestsWrite.
 * 3. Once requestsRead has been processed, the next swap brings in the newly
 *    written requests.
 */
private void swapRequests() {
lock.lock();// spin lock (per the lock's type name)
try {
LinkedList<GroupCommitRequest> tmp = this.requestsWrite;
this.requestsWrite = this.requestsRead;
this.requestsRead = tmp;
} finally {
lock.unlock();
}
}
/**
 * Processes every request in requestsRead: flushes until the flushed position
 * reaches the request's next offset (at most two flush calls per request), then
 * completes the request with PUT_OK or FLUSH_DISK_TIMEOUT. When there are no
 * requests it still issues a single flush.
 */
private void doCommit() {
if (!this.requestsRead.isEmpty()) {
for (GroupCommitRequest req : this.requestsRead) {
// There may be a message in the next file, so a maximum of
// two times the flush
boolean flushOK = CommitLog.this.mappedFileQueue.getFlushedWhere() >= req.getNextOffset();
for (int i = 0; i < 2 && !flushOK; i++) {
CommitLog.this.mappedFileQueue.flush(0);
flushOK = CommitLog.this.mappedFileQueue.getFlushedWhere() >= req.getNextOffset();
}
// Complete the request; still not flushed after the retries is reported as a timeout.
req.wakeupCustomer(flushOK ? PutMessageStatus.PUT_OK : PutMessageStatus.FLUSH_DISK_TIMEOUT);
}
// Record the store timestamp in the checkpoint when it is positive.
long storeTimestamp = CommitLog.this.mappedFileQueue.getStoreTimestamp();
if (storeTimestamp > 0) {
CommitLog.this.defaultMessageStore.getStoreCheckpoint().setPhysicMsgTimestamp(storeTimestamp);
}
// Start the next round with a fresh, empty read list.
this.requestsRead = new LinkedList<>();
} else {
// Because of individual messages is set to not sync flush, it
// will come to this process
CommitLog.this.mappedFileQueue.flush(0);
}
}
/**
 * Service loop: waits up to 10 ms (or until woken), then commits. The list swap
 * happens in onWaitEnd(), which waitForRunning() invokes. After the loop exits it
 * sleeps 10 ms, swaps once more and commits the remaining requests.
 */
@Override
public void run() {
while (!this.isStopped()) {
try {
this.waitForRunning(10);
this.doCommit();
} catch (Exception e) {
CommitLog.log.warn(this.getServiceName() + " service has exception. ", e);
}
}
// Under normal circumstances shutdown, wait for the arrival of the
// request, and then flush
try {
Thread.sleep(10);
} catch (InterruptedException e) {
CommitLog.log.warn("GroupCommitService Exception, ", e);
}
this.swapRequests();
this.doCommit();
CommitLog.log.info(this.getServiceName() + " service end");
}
/**
 * Hook invoked at the end of each wait: swaps the read and write lists.
 */
@Override
protected void onWaitEnd() {
this.swapRequests();
}
/**
 * @return the simple class name, prefixed with the broker identifier when the
 *         broker config reports it is running in a broker container
 */
@Override
public String getServiceName() {
if (CommitLog.this.defaultMessageStore.getBrokerConfig().isInBrokerContainer()) {
return CommitLog.this.defaultMessageStore.getBrokerConfig().getIdentifier() + GroupCommitService.class.getSimpleName();
}
return GroupCommitService.class.getSimpleName();
}
/**
 * @return 300000 (1000 * 60 * 5); presumably a join timeout of five minutes in
 *         milliseconds -- confirm against the base class
 */
@Override
public long getJoinTime() {
return 1000 * 60 * 5;
}
}
/**
 * Base class for background service threads (excerpt). Provides a wakeup /
 * timed-wait handshake built on a resettable latch and a "notified" flag.
 */
public abstract class ServiceThread implements Runnable {
// Latch the service thread blocks on in waitForRunning(); released by wakeup().
protected final CountDownLatch2 waitPoint = new CountDownLatch2(1);
// True when a wakeup has been requested and not yet consumed by waitForRunning().
protected volatile AtomicBoolean hasNotified = new AtomicBoolean(false);
/**
 * Requests a wakeup of the service thread. Only the caller that flips
 * hasNotified from false to true counts the latch down.
 */
public void wakeup() {
if (hasNotified.compareAndSet(false, true)) {// first notification since the flag was last cleared
waitPoint.countDown(); // release a thread blocked in waitForRunning()
}
}
/**
 * Waits for a wakeup or for the interval to elapse, then invokes onWaitEnd().
 * If a wakeup is already pending, the flag is consumed and onWaitEnd() runs
 * immediately without waiting.
 *
 * @param interval maximum time to wait, in milliseconds
 */
protected void waitForRunning(long interval) {// called by service loops before doing their work
if (hasNotified.compareAndSet(true, false)) {// a wakeup is pending: consume it and skip the wait
this.onWaitEnd();// subclass hook (e.g. GroupCommitService swaps its read/write lists)
return;
}
/**
 * No wakeup is pending: block for up to interval ms so the thread does not spin
 * and waste CPU.
 */
//entry to wait
waitPoint.reset();
try {
waitPoint.await(interval, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
log.error("Interrupted", e);
} finally {
// Clear the flag and run the hook whether the wait ended by wakeup, timeout or interrupt.
hasNotified.set(false);
this.onWaitEnd();
}
}
}
3.异步刷盘策略
/**
 * Flush service used for asynchronous flushing (excerpt; part of the loop body is
 * elided in this snippet). Each iteration reads its settings from the message store
 * config, waits, then flushes the mapped file queue.
 */
class FlushRealTimeService extends FlushCommitLogService {
// Time (ms) at which the least-pages threshold was last dropped to 0.
private long lastFlushTimestamp = 0;
// Counts thorough flushes; used to enable progress printing on every 10th one.
private long printTimes = 0;
@Override
public void run() {
while (!this.isStopped()) {
// true: wait with Thread.sleep; false: wait with waitForRunning (which wakeup() can cut short).
boolean flushCommitLogTimed =
CommitLog.this.defaultMessageStore.getMessageStoreConfig().isFlushCommitLogTimed();
int interval = CommitLog.this.defaultMessageStore.getMessageStoreConfig().getFlushIntervalCommitLog();
// Least-pages argument passed to flush(); reset to 0 below when a thorough flush is due.
int flushPhysicQueueLeastPages =
CommitLog.this.defaultMessageStore.getMessageStoreConfig().getFlushCommitLogLeastPages();
int flushPhysicQueueThoroughInterval =
CommitLog.this.defaultMessageStore.getMessageStoreConfig().getFlushCommitLogThoroughInterval();
// Not read in the visible part of this snippet; presumably used in the elided section.
boolean printFlushProgress = false;
// Print flush progress
long currentTimeMillis = System.currentTimeMillis();
if (currentTimeMillis >= (this.lastFlushTimestamp + flushPhysicQueueThoroughInterval)) {
// The thorough interval has elapsed: call flush with a least-pages value of 0 this round.
this.lastFlushTimestamp = currentTimeMillis;
flushPhysicQueueLeastPages = 0;
printFlushProgress = (printTimes++ % 10) == 0;
}
try {
if (flushCommitLogTimed) {// fixed-interval sleep
Thread.sleep(interval);
} else {
this.waitForRunning(interval);// NOTE(review): the original author doubted this branch has any effect -- confirm
}
...
long begin = System.currentTimeMillis();
// TODO(from original notes): the flush picks the qualifying mapped file from mappedFileQueue
CommitLog.this.mappedFileQueue.flush(flushPhysicQueueLeastPages);
// Record the store timestamp in the checkpoint when it is positive.
long storeTimestamp = CommitLog.this.mappedFileQueue.getStoreTimestamp();
if (storeTimestamp > 0) {
CommitLog.this.defaultMessageStore.getStoreCheckpoint().setPhysicMsgTimestamp(storeTimestamp);
}
long past = System.currentTimeMillis() - begin;
CommitLog.this.getMessageStore().getPerfCounter().flowOnce("FLUSH_DATA_TIME_MS", (int) past);
} catch (Throwable e) {
CommitLog.log.warn(this.getServiceName() + " service has exception. ", e);
this.printFlushProgress();
}
}
// Normal shutdown, to ensure that all the flush before exit
boolean result = false;
// Retry flush(0) until it returns true or RETRY_TIMES_OVER attempts have been made.
for (int i = 0; i < RETRY_TIMES_OVER && !result; i++) {
result = CommitLog.this.mappedFileQueue.flush(0);
}
this.printFlushProgress();
CommitLog.log.info(this.getServiceName() + " service end");
}
}
4. FileChannel异步刷盘
RocketMQ实现数据的写是通过mmap方式零拷贝机制实现的。基于内存映射的零拷贝机制可能出现PageCache繁忙问题。通过开启TransientStorePoolEnable机制一定程度上可以缓解该问题的出现。
引入transientStorePoolEnable能缓解pagecache的压力背后关键如下:
- 消息先写入到堆外内存中,该内存由于启用了内存锁定机制,故消息的写入是接近直接操作内存,性能能得到保证。
- 消息进入到堆外内存后,后台会启动一个线程,一批一批将消息提交到pagecache,即写消息时对pagecache的写操作由单条写入变成了批量写入,降低了对pagecache的压力。
引入transientStorePoolEnable会增加数据丢失的可能性:如果Broker JVM进程异常退出,已提交到PageCache中的消息不会丢失,但仍停留在堆外内存(DirectByteBuffer)中、尚未提交到PageCache的这部分消息将会丢失。通常情况下RocketMQ进程异常退出的可能性不大,但如果启用了transientStorePoolEnable,消息发送端仍需要有重新推送机制(补偿思想)。
- 扩容:如果在开启了transientStorePoolEnable后,还会出现pagecache级别的繁忙,那需要集群进行扩容,或者对集群中的topic进行拆分,即将一部分topic迁移到其他集群中,降低集群的负载。
/**
 * Background service that periodically calls {@code mappedFileQueue.commit(...)}
 * and wakes the flush side whenever a commit call returns {@code false}.
 */
class CommitRealTimeService extends FlushCommitLogService {

    // Time (ms) of the last thorough commit, or of the last commit call that returned false.
    private long lastCommitTimestamp = 0;

    /**
     * Service loop: read the commit settings, commit, report the elapsed time, then
     * wait for the configured interval or an earlier wakeup. After the loop exits,
     * the remaining data is committed with a bounded number of retries.
     */
    @Override
    public void run() {
        CommitLog.log.info(this.getServiceName() + " service started");

        while (!this.isStopped()) {
            final int waitMillis =
                CommitLog.this.defaultMessageStore.getMessageStoreConfig().getCommitIntervalCommitLog();
            int leastPages =
                CommitLog.this.defaultMessageStore.getMessageStoreConfig().getCommitCommitLogLeastPages();
            final int thoroughInterval =
                CommitLog.this.defaultMessageStore.getMessageStoreConfig().getCommitCommitLogThoroughInterval();

            final long startedAt = System.currentTimeMillis();
            final boolean thoroughDue = startedAt >= (this.lastCommitTimestamp + thoroughInterval);
            if (thoroughDue) {
                // The thorough interval has elapsed: commit with a least-pages value of 0.
                this.lastCommitTimestamp = startedAt;
                leastPages = 0;
            }

            try {
                // Hand the data held in the mapped files' write buffers over to the file channel.
                final boolean commitResult = CommitLog.this.mappedFileQueue.commit(leastPages);
                final long finishedAt = System.currentTimeMillis();
                if (!commitResult) {
                    // Per the original note: a false result means some data was committed.
                    this.lastCommitTimestamp = finishedAt;
                    CommitLog.this.flushManager.wakeUpFlush();
                }
                final int elapsedMillis = (int) (finishedAt - startedAt);
                CommitLog.this.getMessageStore().getPerfCounter().flowOnce("COMMIT_DATA_TIME_MS", elapsedMillis);
                this.waitForRunning(waitMillis);
            } catch (Throwable e) {
                CommitLog.log.error(this.getServiceName() + " service has exception. ", e);
            }
        }

        this.commitRemainingOnShutdown();
        CommitLog.log.info(this.getServiceName() + " service end");
    }

    // Calls commit(0) until it returns true or RETRY_TIMES_OVER attempts have been made.
    private void commitRemainingOnShutdown() {
        int attempt = 0;
        boolean done = false;
        while (attempt < RETRY_TIMES_OVER && !done) {
            done = CommitLog.this.mappedFileQueue.commit(0);
            attempt++;
        }
    }
}
最终刷盘是利用 FlushRealTimeService
完成对 FileChannel 的刷盘。
- QueueOffset
public class QueueOffsetAssigner {

    /**
     * Returns the current queue offset for the given key and advances the stored
     * offset by {@code messageNum}.
     *
     * @param topicQueueKey key identifying the topic and queue
     * @param messageNum    number of messages being assigned (more than one for a batch)
     * @return the offset stored before this call, or 0 when the key was absent
     */
    public long assignQueueOffset(String topicQueueKey, short messageNum) {
        // Seed the entry with 0 on first use, otherwise read the stored offset.
        final Long assigned = ConcurrentHashMapUtils.computeIfAbsent(this.topicQueueTable, topicQueueKey, k -> 0L);
        // Advance the stored offset past the messages being assigned now.
        final long nextOffset = assigned + messageNum;
        this.topicQueueTable.put(topicQueueKey, nextOffset);
        return assigned;
    }
}
topicQueueKey:是指当前topic & queue。
- WroteOffset
CommitLog目录下存在多个磁盘文件。每个磁盘文件对应一个MappedFile类,文件名表示该文件在CommitLog中的起始偏移量(即此前所有文件内部字节缓冲区byteBuffer的Capacity之和)。
00000000000000000000 00000000000000010240
fileFromOffset可以通过当前MappedFile类对应的磁盘文件名得到。如上所示,如果是第一个文件则fileFromOffset为0,如果是第二个文件则fileFromOffset为10240,…。
long wroteOffset = fileFromOffset + byteBuffer.position()
byteBuffer是指当前MappedFile类持有的字节缓冲区。WroteOffset是指当前消息存在于CommitLog目录下第fileFromOffset / 10240
个文件中,其对应的起始偏移量为byteBuffer.position()。
- wrotePosition
wroteBytes:是指当前消息全部字节数。
WROTE_POSITION_UPDATER.addAndGet(this, wroteBytes);
即当前MappedFile类持有的字节缓冲区添加完当前消息后其position的取值。其实也是下条数据写入的position位置值。