Distributed Transactions in Octopus
A note up front: the distributed-transaction design of Octopus lives in TxManager.cpp. Together with the log, it implements the phases of a distributed transaction (prepare, commit, and so on) as separate pieces, so it is best read alongside a concrete example. Below we look at the implementation of the mknod function in filesystem.cpp and use it to analyze how Octopus implements distributed transactions.
2PC (Two-Phase Commit)
The basics of 2PC are covered at length elsewhere, so we will not repeat them here and instead go straight to Octopus's implementation (only the key code is kept).
bool FileSystem::mknod2pc(const char *path)
{
    if (path == NULL) {
        return false; /* Fail due to null path. */
    } else {
        UniqueHash hashUnique;
        HashTable::getUniqueHash(path, strlen(path), &hashUnique); /* Get unique hash. */
        NodeHash hashNode = storage->getNodeHash(&hashUnique); /* Get node hash by unique hash. */
        AddressHash hashAddress = HashTable::getAddressHash(&hashUnique); /* Get address hash by unique hash. */
        uint64_t DistributedTxID;
        uint64_t LocalTxID;
        uint64_t RemoteTxID, srcBuffer, desBuffer, size, remotekey, offset;
        if (checkLocal(hashNode) == true) { /* If local node. */
            bool result;
            uint64_t key = lockWriteHashItem(hashNode, hashAddress); /* Lock hash item. */
            {
                DistributedTxID = TxDistributedBegin();
                LocalTxID = TxLocalBegin();
                char *parent = (char *)malloc(strlen(path) + 1);
                char *name = (char *)malloc(strlen(path) + 1);
                getParentDirectory(path, parent);
                getNameFromPath(path, name);
                if (addMetaToDirectory(parent, name, false, &RemoteTxID, &srcBuffer, &desBuffer, &size, &remotekey, &offset) == false) {
                    TxDistributedPrepare(DistributedTxID, false);
                    result = false;
                } else {
                    uint64_t indexMeta;
                    bool isDirectory;
                    if (storage->hashtable->get(&hashUnique, &indexMeta, &isDirectory) == true) { /* If path exists. */
                        TxDistributedPrepare(DistributedTxID, false);
                        result = false; /* Fail due to existence of path. */
                    } else {
                        uint64_t indexFileMeta;
                        FileMeta metaFile;
                        metaFile.timeLastModified = time(NULL); /* Set last modified time. */
                        metaFile.count = 0; /* Initialize count of extents as 0. */
                        metaFile.size = 0;
                        /* Apply updated data to local log. */
                        TxWriteData(LocalTxID, (uint64_t)&metaFile, (uint64_t)sizeof(FileMeta));
                        /* Receive remote prepare with (OK). */
                        TxDistributedPrepare(DistributedTxID, true);
                        /* Start phase 2, commit it. */
                        updateDirectoryMeta(parent, RemoteTxID, srcBuffer, desBuffer, size, remotekey, offset);
                        /* Only allocate memory, write to log first. */
                        if (storage->tableFileMeta->create(&indexFileMeta, &metaFile) == false) {
                            result = false; /* Fail due to create error. */
                        } else {
                            if (storage->hashtable->put(&hashUnique, indexFileMeta, false) == false) { /* false for file. */
                                result = false; /* Fail due to hash table put. No roll back. */
                            } else {
                                result = true;
                            }
                        }
                    }
                }
                free(parent);
                free(name);
            }
            if (result == false) {
                TxLocalCommit(LocalTxID, false);
                TxDistributedCommit(DistributedTxID, false);
            } else {
                TxLocalCommit(LocalTxID, true);
                TxDistributedCommit(DistributedTxID, true);
            }
            unlockWriteHashItem(key, hashNode, hashAddress); /* Unlock hash item. */
            return result; /* Return specific result. */
        } else { /* If remote node. */
            return false;
        }
    }
}
This function does the following. First, it computes the node hash and the file's hash in order to lock the corresponding file; this part is not the core of the 2PC code.
Note: the phases described below all refer to the distributed transaction; the local transaction is only involved in begin & commit.
Begin phase
Next is the begin phase of the transaction. This server starts one local transaction and one distributed transaction; concretely, it allocates an ID in the local log and in the distributed log respectively. (Notice that this server is in fact both a participant and the coordinator.)
The code:
uint64_t TxManager::TxLocalBegin() {
    lock_guard<mutex> lck(LocalMutex);
    LocalLogEntry *log = (LocalLogEntry *)LocalLogAddress;
    log[LocalLogIndex].TxID = LocalLogIndex;
    log[LocalLogIndex].begin = true;
    FlushData((uint64_t)&log[LocalLogIndex], CACHELINE_SIZE);
    LocalLogIndex += 1;
    return (LocalLogIndex - 1);
}

uint64_t TxManager::TxDistributedBegin() {
    lock_guard<mutex> lck(DisMutex);
    DistributedLogEntry *log = (DistributedLogEntry *)DistributedLogAddress;
    log[DistributedLogIndex].TxID = DistributedLogIndex;
    log[DistributedLogIndex].begin = true;
    FlushData((uint64_t)&log[DistributedLogIndex], CACHELINE_SIZE);
    DistributedLogIndex += 1;
    return (DistributedLogIndex - 1);
}

void TxManager::FlushData(uint64_t address, uint64_t size) {
    uint32_t i;
    size = size + ((unsigned long)(address) & (CACHELINE_SIZE - 1));
    for (i = 0; i < size; i += CACHELINE_SIZE) {
        _mm_clflush((void *)(address + i));
    }
}
As shown, a FlushData function is used to flush the cached log data back to memory, which preserves data consistency. Moreover, each log-state update is a 1-byte write, well within the 8-byte atomic-write range, so the consistency here is guaranteed by the nature of NVM itself.
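One side note on the flush: on x86, persist helpers commonly flush each cache line and then issue a store fence; the fence is required for the weakly ordered CLFLUSHOPT/CLWB instructions, while plain CLFLUSH is already strongly ordered with respect to writes, which is presumably why Octopus's FlushData omits it. A minimal sketch of the generic pattern (an illustration of the technique, not Octopus's code):

#include <emmintrin.h>   /* _mm_clflush, _mm_sfence (via xmmintrin.h) */
#include <stdint.h>

#define CACHELINE_SIZE 64

/* Flush [address, address + size) cache line by cache line, then fence
 * so the flushes are ordered before any later store. */
static void persist(uint64_t address, uint64_t size) {
    size += address & (CACHELINE_SIZE - 1);        /* Cover the unaligned head. */
    for (uint64_t i = 0; i < size; i += CACHELINE_SIZE)
        _mm_clflush((void *)(address + i));
    _mm_sfence();                                  /* Needed for CLFLUSHOPT/CLWB variants. */
}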
Prepare phase
After the begin phase comes the prepare phase. It has two key steps: addMetaToDirectory, and allocating memory for the empty inode.
Since our example is mknod, we are creating an empty inode, i.e. an empty file. We therefore derive the parent directory and the file name from the path, then update the parent directory's metadata: its count is incremented and the name is appended to the name array. At this point no memory has actually been allocated for the new file's metadata; we have only told its parent directory that such an entry exists. Note that this update is itself a transaction (a local transaction if the parent is on this node, a remote one otherwise), and only once it succeeds can the rest proceed.
(FileSystem::addMetaToDirectory, only the key code is kept:)
bool FileSystem::addMetaToDirectory(const char *path, const char *name, bool isDirectory,
    uint64_t *TxID, uint64_t *srcBuffer, uint64_t *desBuffer, uint64_t *size, uint64_t *key, uint64_t *offset)
{
    uint64_t LocalTxID;
    /* (Elided: hash computation for `path` that yields hashNode / hashUnique.) */
    if (checkLocal(hashNode) == true) { /* If local node. */
        bool result;
        // lock hash item... (!! this lock will be unlocked in updateDirectoryMeta() !!)
        {
            uint64_t indexDirectoryMeta; /* Meta index of directory. */
            bool isDirectoryTemporary;   /* Different from parameter isDirectory. */
            /* (Elided: the hashtable lookup that fills indexDirectoryMeta and isDirectoryTemporary.) */
            if (isDirectoryTemporary == false) { /* If not a directory. */
                result = false; /* Fail due to path is not directory. */
            } else {
                DirectoryMeta metaDirectory;
                if (storage->tableDirectoryMeta->get(indexDirectoryMeta, &metaDirectory) == false) { /* Get directory meta. */
                    result = false; /* Fail due to get directory meta error. */
                } else {
                    LocalTxID = TxLocalBegin();
                    metaDirectory.count++; /* Add count of names under directory. */
                    strcpy(metaDirectory.tuple[metaDirectory.count - 1].names, name); /* Add name. */
                    metaDirectory.tuple[metaDirectory.count - 1].isDirectories = isDirectory; /* Add directory state. */
                    TxWriteData(LocalTxID, (uint64_t)&metaDirectory, (uint64_t)sizeof(DirectoryMeta));
                    *srcBuffer = getTxWriteDataAddress(LocalTxID);
                    *size = (uint64_t)sizeof(DirectoryMeta);
                    *TxID = LocalTxID;
                    result = storage->tableDirectoryMeta->put(indexDirectoryMeta, &metaDirectory, desBuffer);
                }
            }
        }
        TxLocalCommit(LocalTxID, result);
        return result; /* Return specific result. */
    } else { /* If remote node. */
        AddMetaToDirectorySendBuffer bufferAddMetaToDirectorySend; /* Send buffer. */
        bufferAddMetaToDirectorySend.message = MESSAGE_ADDMETATODIRECTORY; /* Assign message type. */
        strcpy(bufferAddMetaToDirectorySend.path, path); /* Assign path. */
        strcpy(bufferAddMetaToDirectorySend.name, name); /* Assign name. */
        bufferAddMetaToDirectorySend.isDirectory = isDirectory;
        UpdataDirectoryMetaReceiveBuffer bufferGeneralReceive;
        RdmaCall((uint16_t)hashNode,
                 (char *)&bufferAddMetaToDirectorySend,
                 (uint64_t)sizeof(AddMetaToDirectorySendBuffer),
                 (char *)&bufferGeneralReceive,
                 (uint64_t)sizeof(UpdataDirectoryMetaReceiveBuffer));
        *srcBuffer = bufferGeneralReceive.srcBuffer;
        *desBuffer = bufferGeneralReceive.desBuffer;
        *TxID = bufferGeneralReceive.TxID;
        *size = bufferGeneralReceive.size;
        *key = bufferGeneralReceive.key;
        *offset = bufferGeneralReceive.offset;
        return bufferGeneralReceive.result;
    }
}
If addMetaToDirectory fails, the prepare phase fails and we exit immediately.
If it succeeds, memory is actually allocated for the empty metadata (the inode). After the allocation, the newly built metaFile is likewise written to the local log via TxWriteData. At this point we consider the prepare phase finished.
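TxWriteData itself is not shown in the excerpts above. As a rough idea of what it plausibly does, here is a hypothetical sketch, assuming the log entry carries a data area and a length field (the field names data and dataSize are my assumptions, not Octopus's actual layout):

/* Hypothetical sketch: stage the write-set of transaction TxID in its
 * local log entry and persist it before prepare/commit. */
void TxManager::TxWriteData(uint64_t TxID, uint64_t buffer, uint64_t size) {
    lock_guard<mutex> lck(LocalMutex);
    LocalLogEntry *log = (LocalLogEntry *)LocalLogAddress;
    memcpy((void *)log[TxID].data, (void *)buffer, size); /* Copy caller's data into the log. */
    log[TxID].dataSize = size;
    FlushData((uint64_t)log[TxID].data, size);            /* Persist the data... */
    FlushData((uint64_t)&log[TxID].dataSize, CACHELINE_SIZE); /* ...then the length. */
}

This would also explain getTxWriteDataAddress(LocalTxID) in addMetaToDirectory: it simply returns the address of that staged data area, which is handed back to the coordinator as srcBuffer.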
Commit phase
Entering the commit phase, the coordinator's data is considered ready: the new metadata for mknod has been allocated and must now be applied to the DirectoryMeta. That is exactly what updateDirectoryMeta does:
bool FileSystem::updateDirectoryMeta(const char *path, uint64_t TxID, uint64_t srcBuffer,
    uint64_t desBuffer, uint64_t size, uint64_t key, uint64_t offset) {
    if (path == NULL) {
        return false; /* Fail due to null path. */
    } else {
        UniqueHash hashUnique;
        HashTable::getUniqueHash(path, strlen(path), &hashUnique); /* Get unique hash. */
        NodeHash hashNode = storage->getNodeHash(&hashUnique); /* Get node hash by unique hash. */
        if (checkLocal(hashNode) == true) {
            bool result = true;
            uint64_t indexDirectoryMeta;
            bool isDirectory;
            if (storage->hashtable->get(&hashUnique, &indexDirectoryMeta, &isDirectory) == false) { /* If path does not exist. */
                result = false; /* Fail due to non-existence of path. */
            } else {
                result = true;
                memcpy((void *)desBuffer, (void *)srcBuffer, size);
            }
            unlockWriteHashItem(key, hashNode, (AddressHash)offset); /* Unlock hash item (locked in addMetaToDirectory()). */
            TxLocalCommit(TxID, true);
            return result;
        } else {
            DoRemoteCommitSendBuffer bufferSend;
            strcpy(bufferSend.path, path);
            bufferSend.message = MESSAGE_DOCOMMIT;
            bufferSend.TxID = TxID;
            bufferSend.srcBuffer = srcBuffer;
            bufferSend.desBuffer = desBuffer;
            bufferSend.size = size;
            bufferSend.key = key;
            bufferSend.offset = offset;
            GeneralReceiveBuffer bufferReceive;
            RdmaCall((uint16_t)hashNode,
                     (char *)&bufferSend,
                     (uint64_t)sizeof(DoRemoteCommitSendBuffer),
                     (char *)&bufferReceive,
                     (uint64_t)sizeof(GeneralReceiveBuffer));
            return bufferReceive.result;
        }
    }
}
The basic idea of this code is simple: copy srcBuffer (the log data area for the given TxID, written by TxWriteData) into desBuffer (the item at the corresponding index in the DirectoryMeta table). (One thing I don't quite get is why it calls TxLocalCommit(TxID, true); it feels like it should be TxLocalCommit(TxID, result).)
After that, metaFile is written into the corresponding slot of tableFileMeta; once that succeeds, the commit phase is considered successful.
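For completeness, TxLocalCommit presumably mirrors TxLocalBegin: it records the final state of the transaction and flushes it. A plausible sketch (the commit field name is an assumption):

/* Hypothetical sketch: record commit (true) or abort (false) for a local
 * transaction and persist it, mirroring TxLocalBegin shown earlier. */
void TxManager::TxLocalCommit(uint64_t TxID, bool commit) {
    lock_guard<mutex> lck(LocalMutex);
    LocalLogEntry *log = (LocalLogEntry *)LocalLogAddress;
    log[TxID].commit = commit;   /* 1-byte state write: atomic on NVM. */
    FlushData((uint64_t)&log[TxID], CACHELINE_SIZE);
}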
Summary
The distributed transaction: begin phase (allocate TxIDs and the corresponding log entries) -> prepare phase (1. update the parent directory's metadata; 2. allocate memory for the new metadata) -> commit phase (copy the new metadata into the DirectoryMeta slot at the corresponding index).
What 2PC really emphasizes is that prepare and commit are two separate phases.
Collect-Dispatch Transaction Design
The collect-dispatch transaction (cd transaction) is a new kind of distributed transaction proposed in Octopus. Let us first briefly go over the idea.
Traditional 2PC incurs very high overhead (distributed logging, distributed locking, log persistence, and so on), so Octopus implements cd transactions with RDMA primitives, targeting both crash consistency and concurrency control.
For crash consistency, cd transactions rely on local logging with remote in-place updates. In the collect phase, Octopus collects the read and write sets from the participants and executes them as a local transaction with local logging at the coordinator. Because participants' log data is updated directly at the remote (coordinator) side, participants do not need to keep their own log data, which removes the complex negotiation and log-persistence traffic between coordinator and participants. In the dispatch phase, the coordinator pushes the modifications to the other participants via RDMA write, without the participants being aware of it.
For concurrency control, a combination of GCC and RDMA locking is used. In a cd transaction, at both the coordinator and the participants, local locks are acquired with the GCC compare_and_swap builtin; local unlock also uses the GCC compare_and_swap, while remote unlock goes through RDMA compare_and_swap. The RDMA unlock requires no CPU involvement on the remote side, which simplifies the unlock path considerably.
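To make the local/remote lock split concrete, here is a hedged sketch along those lines. The GCC builtin __sync_bool_compare_and_swap is real; RdmaCAS is a hypothetical stand-in for whatever wrapper issues the RDMA ATOMIC_CMP_AND_SWP verb:

#include <stdint.h>

/* Hypothetical one-sided CAS verb: atomically sets the remote lock word to
 * `desired` if it currently equals `expected`. */
void RdmaCAS(uint16_t nodeID, uint64_t remoteLockAddr, uint64_t expected, uint64_t desired);

/* Local lock/unlock via the GCC compare-and-swap builtin. */
static bool localLock(volatile uint64_t *lockWord, uint64_t holderID) {
    return __sync_bool_compare_and_swap(lockWord, 0UL, holderID); /* free -> held */
}
static void localUnlock(volatile uint64_t *lockWord, uint64_t holderID) {
    __sync_bool_compare_and_swap(lockWord, holderID, 0UL);        /* held -> free */
}

/* Remote unlock: one-sided, the remote CPU is never involved. */
static void remoteUnlock(uint16_t nodeID, uint64_t remoteLockAddr, uint64_t holderID) {
    RdmaCAS(nodeID, remoteLockAddr, /*expected=*/holderID, /*desired=*/0UL);
}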
Likewise, let us look at mknodcd.
bool FileSystem::mknodcd(const char *path)
{
    Debug::debugTitle("FileSystem::mknod");
    Debug::debugItem("Stage 1. Entry point. Path: %s.", path);
    if (path == NULL) {
        return false; /* Fail due to null path. */
    } else {
        UniqueHash hashUnique;
        HashTable::getUniqueHash(path, strlen(path), &hashUnique); /* Get unique hash. */
        NodeHash hashNode = storage->getNodeHash(&hashUnique); /* Get node hash by unique hash. */
        AddressHash hashAddress = HashTable::getAddressHash(&hashUnique); /* Get address hash by unique hash. */
        // uint64_t DistributedTxID;
        uint64_t LocalTxID;
        if (checkLocal(hashNode) == true) { /* If local node. */
            bool result;
            uint64_t key = lockWriteHashItem(hashNode, hashAddress); /* Lock hash item. */
            {
                // DistributedTxID = TxDistributedBegin();
                LocalTxID = TxLocalBegin();
                Debug::debugItem("Stage 2. Update parent directory metadata.");
                char *parent = (char *)malloc(strlen(path) + 1);
                char *name = (char *)malloc(strlen(path) + 1);
                DirectoryMeta parentMeta;
                uint64_t parentHashAddress, parentMetaAddress;
                uint16_t parentNodeID;
                getParentDirectory(path, parent);
                getNameFromPath(path, name);
                if (readDirectoryMeta(parent, &parentMeta, &parentHashAddress, &parentMetaAddress, &parentNodeID) == false) {
                    Debug::notifyError("readDirectoryMeta failed.");
                    // TxDistributedPrepare(DistributedTxID, false);
                    result = false;
                } else {
                    uint64_t indexMeta;
                    bool isDirectory;
                    if (storage->hashtable->get(&hashUnique, &indexMeta, &isDirectory) == true) { /* If path exists. */
                        Debug::notifyError("addMetaToDirectory failed.");
                        // TxDistributedPrepare(DistributedTxID, false);
                        result = false; /* Fail due to existence of path. */
                    } else {
                        /* Update directory meta first. */
                        parentMeta.count++; /* Add count of names under directory. */
                        strcpy(parentMeta.tuple[parentMeta.count - 1].names, name); /* Add name. */
                        parentMeta.tuple[parentMeta.count - 1].isDirectories = isDirectory; /* Add directory state. */
                        Debug::debugItem("Stage 3. Create file meta.");
                        uint64_t indexFileMeta;
                        FileMeta metaFile;
                        metaFile.timeLastModified = time(NULL); /* Set last modified time. */
                        metaFile.count = 0; /* Initialize count of extents as 0. */
                        metaFile.size = 0;
                        /* Apply updated data to local log. */
                        TxWriteData(LocalTxID, (uint64_t)&parentMeta, (uint64_t)sizeof(DirectoryMeta));
                        /* Receive remote prepare with (OK). */
                        // TxDistributedPrepare(DistributedTxID, true);
                        /* Start phase 2, commit it. */
                        updateRemoteMeta(parentNodeID, &parentMeta, parentMetaAddress, parentHashAddress);
                        /* Only allocate memory, write to log first. */
                        if (storage->tableFileMeta->create(&indexFileMeta, &metaFile) == false) {
                            result = false; /* Fail due to create error. */
                        } else {
                            if (storage->hashtable->put(&hashUnique, indexFileMeta, false) == false) { /* false for file. */
                                result = false; /* Fail due to hash table put. No roll back. */
                            } else {
                                result = true;
                            }
                        }
                    }
                }
                free(parent);
                free(name);
            }
            TxLocalCommit(LocalTxID, result);
            // TxDistributedCommit(DistributedTxID, result);
            unlockWriteHashItem(key, hashNode, hashAddress); /* Unlock hash item. */
            Debug::debugItem("Stage end.");
            return result; /* Return specific result. */
        } else { /* If remote node. */
            return false;
        }
    }
}
With the 2PC analysis above, mknodcd is clearly very similar to mknod2pc. The differences:
- mknodcd involves no distributed transaction (TxDistributedBegin() and friends); everything is a local transaction;
- the worker functions differ: 2PC uses addMetaToDirectory and updateDirectoryMeta, whereas mknodcd uses readDirectoryMeta and updateRemoteMeta.
Next we analyze these two function pairs in detail.
First, readDirectoryMeta:
bool FileSystem::readDirectoryMeta(const char *path, DirectoryMeta *meta, uint64_t *hashAddress, uint64_t *metaAddress, uint16_t *parentNodeID) {
    Debug::debugTitle("FileSystem::readDirectoryMeta");
    Debug::debugItem("Stage 1. Entry point. Path: %s.", path);
    if (path == NULL) { /* Judge if path and list buffer are valid. */
        return false;   /* Null parameter error. */
    } else {
        bool result;
        UniqueHash hashUnique;
        HashTable::getUniqueHash(path, strlen(path), &hashUnique); /* Get unique hash. */
        NodeHash hashNode = storage->getNodeHash(&hashUnique); /* Get node hash by unique hash. */
        *hashAddress = HashTable::getAddressHash(&hashUnique); /* Get address hash by unique hash. */
        *parentNodeID = (uint16_t)hashNode;
        if (checkLocal(hashNode) == true) { /* If local node. */
            uint64_t key = lockReadHashItem(hashNode, *hashAddress); /* Lock hash item. */
            {
                uint64_t indexDirectoryMeta;
                bool isDirectory;
                if (storage->hashtable->get(&hashUnique, &indexDirectoryMeta, &isDirectory) == false) { /* If path does not exist. */
                    Debug::notifyError("path does not exist");
                    result = false; /* Fail due to path does not exist. */
                } else {
                    Debug::debugItem("Stage 2. Get meta.");
                    if (isDirectory == false) { /* If file meta. */
                        Debug::notifyError("Not a directory");
                        result = false; /* Fail due to not directory. */
                    } else {
                        if (storage->tableDirectoryMeta->get(indexDirectoryMeta, meta, metaAddress) == false) {
                            Debug::notifyError("Fail due to get directory meta error.");
                            result = false; /* Fail due to get directory meta error. */
                        } else {
                            Debug::debugItem("metaAddress = %lx, getDmfsBaseAddress = %lx", *metaAddress, server->getMemoryManagerInstance()->getDmfsBaseAddress());
                            *metaAddress = *metaAddress - server->getMemoryManagerInstance()->getDmfsBaseAddress();
                            result = true; /* Succeed. */
                        }
                    }
                }
            }
            unlockReadHashItem(key, hashNode, *hashAddress); /* Unlock hash item. */
            Debug::debugItem("Stage end.");
            return result; /* Return specific result. */
        } else { /* If remote node. */
            GeneralSendBuffer bufferSend;
            bufferSend.message = MESSAGE_READDIRECTORYMETA;
            strcpy(bufferSend.path, path);
            ReadDirectoryMetaReceiveBuffer bufferReceive;
            RdmaCall((uint16_t)hashNode,
                     (char *)&bufferSend,
                     (uint64_t)sizeof(GeneralSendBuffer),
                     (char *)&bufferReceive,
                     (uint64_t)sizeof(ReadDirectoryMetaReceiveBuffer));
            if (bufferReceive.result == false) {
                Debug::notifyError("Remote Call readDirectoryMeta failed");
                result = false;
            } else {
                memcpy((void *)meta, (void *)&(bufferReceive.meta), sizeof(DirectoryMeta));
                *hashAddress = bufferReceive.hashAddress;
                *metaAddress = bufferReceive.metaAddress;
                *parentNodeID = bufferReceive.parentNodeID;
                return bufferReceive.result;
            }
            return false;
        }
    }
}
Unlike cd, 2PC also maintains log data on the participant side (here we call the participant the remote end, and the coordinator the local end). That raises a remote-durability problem: the local end must wait until the remote log data has been persisted before prepare can succeed, which is rather cumbersome. Now look at updateRemoteMeta:
void FileSystem::updateRemoteMeta(uint16_t parentNodeID, DirectoryMeta *meta, uint64_t parentMetaAddress, uint64_t parentHashAddress) {
    Debug::debugTitle("updateRemoteMeta");
    /* Prepare imm data. */
    uint32_t imm, temp;
    /*
    |  12b  |     20b     |
    +-------+-------------+
    | 0xFFF | HashAddress |
    +-------+-------------+
    */
    temp = 0xFFF;
    imm = (temp << 20);
    imm += (uint32_t)parentHashAddress;
    /* Remote write with imm. */
    uint64_t SendBuffer;
    server->getMemoryManagerInstance()->getServerSendAddress(parentNodeID, &SendBuffer);
    uint64_t RemoteBuffer = parentMetaAddress;
    Debug::debugItem("imm = %x, SendBuffer = %lx, RemoteBuffer = %lx", imm, SendBuffer, RemoteBuffer);
    if (parentNodeID == server->getRdmaSocketInstance()->getNodeID()) { /* Parent is local: plain memcpy. */
        memcpy((void *)(RemoteBuffer + server->getMemoryManagerInstance()->getDmfsBaseAddress()), (void *)meta, sizeof(DirectoryMeta));
        //unlockWriteHashItem(0, parentNodeID, parentHashAddress);
        return;
    }
    /* Only send the used part of the tuple array. */
    uint64_t size = sizeof(DirectoryMeta) - sizeof(DirectoryMetaTuple) * (MAX_DIRECTORY_COUNT - meta->count);
    memcpy((void *)SendBuffer, (void *)meta, size);
    server->getRdmaSocketInstance()->RdmaWrite(parentNodeID, SendBuffer, RemoteBuffer, size, -1, 1);
    server->getRdmaSocketInstance()->RdmaRead(parentNodeID, SendBuffer, RemoteBuffer, size, 1);
    /* Data is written to the remote address, and the lock is released with the assist of imm data. */
    /* A READ is sent after the WRITE, flushing the remote data. */
}
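Note the imm layout above: the lock taken in readDirectoryMeta is released on the participant with the help of the imm value carried by the WRITE_WITH_IMM, so the coordinator needs no extra unlock message. A hypothetical sketch of the participant-side handling (the handler name and invocation are my assumptions; Octopus's actual receive path is not shown here):

/* Hypothetical sketch: on a WRITE_WITH_IMM completion, decode the 32-bit
 * imm value: the high 12 bits are the 0xFFF tag, the low 20 bits carry the
 * hash address of the lock to release. */
void onWriteWithImm(uint32_t imm, NodeHash localNode) {
    if ((imm >> 20) == 0xFFF) {
        uint64_t hashAddress = imm & ((1u << 20) - 1);  /* Low 20 bits. */
        unlockWriteHashItem(0, localNode, (AddressHash)hashAddress); /* Lock taken in readDirectoryMeta(). */
    }
}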
Summary
For the mknod operation, the coordinator is the server where the operation is issued, and the participant is the server holding this file's parent directory metadata.
If those two are the same server, a plain memcpy suffices and no complicated distributed transaction is involved. If they are different machines, we first read the remote parent directory metadata into a local ReadDirectoryMetaReceiveBuffer (this corresponds to the collect phase); later, updateRemoteMeta sends the modified data back (RDMA write) to the remote machine, followed by an RDMA read that forces the data in the remote IB NIC's buffer back to memory.
Collect shows up as reading the remote data to the local side, with the remote side keeping no log for this transaction: the logging changes all happen locally. Dispatch shows up as synchronizing the changes back once the modification is done. (I previously claimed that TxManager.cpp contained nothing and that the cd transaction was all talk with no implementation; I take that back, my sincere apologies...)
Finally, let us line the whole mknodcd flow up against the figure in the paper:
/*
* Coordinator: Participant: in function `FileSystem::mknodcd` :
* {begin}
* | wait
* [log begin] | TxLocalBegin()
* | local lock
* | |
* local lock collect writeset readDirectoryMeta() begin
* | |
* wait <----------------------------------|
* | | readDirectoryMeta() done: get parentMeta
* +-----------------------------+ | // parentMeta do something
* | Local transaction Execution | | // create new file's metadata
* +-----------------------------+ |
* | |
* [log context] |
* | |
* Write Data | TxWriteData()
* Local unlock |
* | ------update writeset-------------> | updateRemoteMeta()
 * |  ------remote unlock--------------->  |  then create a filemeta entry and put metaFile index in hashtable
* | |
* [log commit or abort] | TxLocalCommit()
* | |
* {end} {end}
*/