#define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024*1024*8) /* 8 MB */
void readSyncBulkPayload(connection *conn) {
char buf[PROTO_IOBUF_LEN];
ssize_t nread, readlen, nwritten;
int use_diskless_load = useDisklessLoad();
redisDb *diskless_load_tempDb = NULL;
functionsLibCtx* temp_functions_lib_ctx = NULL;
int empty_db_flags = server.repl_slave_lazy_flush ? EMPTYDB_ASYNC :
EMPTYDB_NO_FLAGS;
off_t left;
/* Static vars used to hold the EOF mark, and the last bytes received
* from the server: when they match, we reached the end of the transfer. */
static char eofmark[CONFIG_RUN_ID_SIZE];
static char lastbytes[CONFIG_RUN_ID_SIZE];
static int usemark = 0;
/* If repl_transfer_size == -1 we still have to read the bulk length
* from the master reply. */
if (server.repl_transfer_size == -1) {
nread = connSyncReadLine(conn,buf,1024,server.repl_syncio_timeout*1000);
if (nread == -1) {
serverLog(LL_WARNING,
"I/O error reading bulk count from MASTER: %s",
connGetLastError(conn));
goto error;
} else {
/* nread here is returned by connSyncReadLine(), which calls syncReadLine() and
* convert "\r\n" to '\0' so 1 byte is lost. */
atomicIncr(server.stat_net_repl_input_bytes, nread+1);
}
if (buf[0] == '-') {
serverLog(LL_WARNING,
"MASTER aborted replication with an error: %s",
buf+1);
goto error;
} else if (buf[0] == '\0') {
/* At this stage just a newline works as a PING in order to take
* the connection live. So we refresh our last interaction
* timestamp. */
server.repl_transfer_lastio = server.unixtime;
return;
} else if (buf[0] != '$') {
serverLog(LL_WARNING,"Bad protocol from MASTER, the first byte is not '$' (we received '%s'), are you sure the host and port are right?", buf);
goto error;
}
/* There are two possible forms for the bulk payload. One is the
* usual $<count> bulk format. The other is used for diskless transfers
* when the master does not know beforehand the size of the file to
* transfer. In the latter case, the following format is used:
*
* $EOF:<40 bytes delimiter>
*
* At the end of the file the announced delimiter is transmitted. The
* delimiter is long and random enough that the probability of a
* collision with the actual file content can be ignored. */
if (strncmp(buf+1,"EOF:",4) == 0 && strlen(buf+5) >= CONFIG_RUN_ID_SIZE) {
usemark = 1;
memcpy(eofmark,buf+5,CONFIG_RUN_ID_SIZE);
memset(lastbytes,0,CONFIG_RUN_ID_SIZE);
/*
memcpy(eofmark,buf+5,CONFIG_RUN_ID_SIZE);:将一段内存空间(即从 buf + 5 开始的位置至接下来的
CONFIG_RUN_ID_SIZE 个字节)内容复制到 eofmark 中,其中 eofmark 是一个全局变量,用于保存 “EOF:” 后面的复制 ID。这行代码实现了将复制 ID 缓存在 eofmark 变量中,以供后续使用。
memset(lastbytes,0,CONFIG_RUN_ID_SIZE);:将一段内存空间(即从 lastbytes 开始的位置至接下来的 CONFIG_RUN_ID_SIZE 个字节)内容全部置为0。其中lastbytes 变量表示用于在特定情况下缓存部分已经
接收的数据,在接收完所有数据后会对其加以处理。这行代码实现了对其内容进行清空,防止影响后续的同步操作。
这是为了避免接收到新的数据时,将缓存在lastbytes数组中的旧数据错误地用于数据同步。在接收到新数据之前,需要将缓存区的内容清空,这样才能保证接收到的数据是正确的。
在Redis主从同步的过程中,lastbytes数组用于缓存已经接收到的除"EOF:"标识外的数据部分。在数据传输
过程中,如果传输的数据部分大小超过了从节点设置的最大缓存空间,Redis将按照传输的数据块所指定的最
小窗口大小,循环接收数据,多余的数据将被缓存到lastbytes数组中。在接收所有的数据块之后,
lastbytes中缓存的是最后一个块中被截断的多出来的部分。
接下来,Redis从节点会将lastbytes数组中缓存的数据与接收到的最后一个数据块进行合并,形成完整的数
据,并开始进行数据存储或加载。这个数据块的大小不一定能够被整除,因此需要在lastbytes下一次接收
到数据块之前将lastbytes数组清空,防止上一个数据块中的残留数据对数据的正确性造成影响。
因此,在这个场景中,对lastbytes进行清空,即初始化为0,可以避免数据传输过程中遗留数据对数据同步
过程造成的影响,确保从节点接收到的所有数据都是正确的。
*/
/* Set any repl_transfer_size to avoid entering this code path
* at the next call. */
server.repl_transfer_size = 0;
serverLog(LL_NOTICE,
"MASTER <-> REPLICA sync: receiving streamed RDB from master with EOF %s",
use_diskless_load? "to parser":"to disk");
} else {
usemark = 0;
server.repl_transfer_size = strtol(buf+1,NULL,10);
serverLog(LL_NOTICE,
"MASTER <-> REPLICA sync: receiving %lld bytes from master %s",
(long long) server.repl_transfer_size,
use_diskless_load? "to parser":"to disk");
}
return;
}
if (!use_diskless_load) {
/* Read the data from the socket, store it to a file and search
* for the EOF. */
if (usemark) {
readlen = sizeof(buf);
} else {
/*
如果usemark为假,说明当前正在进行增量复制,那么本次从主节点读取的数据长度
不能超过尚未复制的总数据量left
*/
left = server.repl_transfer_size - server.repl_transfer_read;
readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf);
}
nread = connRead(conn,buf,readlen);
if (nread <= 0) {
if (connGetState(conn) == CONN_STATE_CONNECTED) {
/* equivalent to EAGAIN */
return;
}
serverLog(LL_WARNING,"I/O error trying to sync with MASTER: %s",
(nread == -1) ? connGetLastError(conn) : "connection lost");
cancelReplicationHandshake(1);
return;
}
atomicIncr(server.stat_net_repl_input_bytes, nread);
/* When a mark is used, we want to detect EOF asap in order to avoid
* writing the EOF mark into the file... */
int eof_reached = 0;
/*
如果处于RDB文件接收模式,usemark标志为1,该段代码将处理当前读取的数据,并检查是否到达了RDB文件
的末尾。具体来说,这个段落将更新之前保存的lastbytes数组,并检查是否与eofmark值(即标记接收RDB
完成的标志)匹配。匹配上即说明接收到RDB文件末尾的标志,那么此时应该退出RDB文件传输模式,并重新
设定repl_transfer_size的值,以应对后续新的同步传输。具体如下:
检查当前读取到的数据是否至少有一个完整的CONFIG_RUN_ID_SIZE长度,这个长度是在读取EOF标记时记录
的,也是从当前的buf中获取的。如果当前读取长度不够一个完整的CONFIG_RUN_ID_SIZE,则需要将上次读
取的的后CONFIG_RUN_ID_SIZE-nread部分缓存下来,并与当前读取到的部分拼接出一个完整的
CONFIG_RUN_ID_SIZE长度。这样可以避免先前读取的数据被丢失。
拼接之后,用memcmp()比较lastbytes数组与eofmark数组中连续的CONFIG_RUN_ID_SIZE字节是否相同;如
果是,则设置eof_reached标志为1,表示已经接收到了RDB文件的最后标志,RDB文件传输过程结束。
如果没有处于RDB文件接收模式,即usemark标志为0,则不需要进行处理,程序忽略这段代码并继续向下执行。
*/
if (usemark) {
/* Update the last bytes array, and check if it matches our
* delimiter. */
if (nread >= CONFIG_RUN_ID_SIZE) {
memcpy(lastbytes,buf+nread-CONFIG_RUN_ID_SIZE,
CONFIG_RUN_ID_SIZE);
} else {
int rem = CONFIG_RUN_ID_SIZE-nread;
memmove(lastbytes,lastbytes+nread,rem);
memcpy(lastbytes+rem,buf,nread);
}
if (memcmp(lastbytes,eofmark,CONFIG_RUN_ID_SIZE) == 0)
eof_reached = 1;
}
/* Update the last I/O time for the replication transfer (used in
* order to detect timeouts during replication), and write what we
* got from the socket to the dump file on disk. */
server.repl_transfer_lastio = server.unixtime;
if ((nwritten = write(server.repl_transfer_fd,buf,nread)) != nread) {
serverLog(LL_WARNING,
"Write error or short write writing to the DB dump file "
"needed for MASTER <-> REPLICA synchronization: %s",
(nwritten == -1) ? strerror(errno) : "short write");
goto error;
}
server.repl_transfer_read += nread;
/* Delete the last 40 bytes from the file if we reached EOF. */
if (usemark && eof_reached) {
if (ftruncate(server.repl_transfer_fd,
server.repl_transfer_read - CONFIG_RUN_ID_SIZE) == -1)
{
serverLog(LL_WARNING,
"Error truncating the RDB file received from the master "
"for SYNC: %s", strerror(errno));
goto error;
}
}
/* Sync data on disk from time to time, otherwise at the end of the
* transfer we may suffer a big delay as the memory buffers are copied
* into the actual disk. */
if (server.repl_transfer_read >=
server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC)
{
off_t sync_size = server.repl_transfer_read -
server.repl_transfer_last_fsync_off;
rdb_fsync_range(server.repl_transfer_fd,
server.repl_transfer_last_fsync_off, sync_size);
server.repl_transfer_last_fsync_off += sync_size;
}
/* Check if the transfer is now complete */
if (!usemark) {
if (server.repl_transfer_read == server.repl_transfer_size)
eof_reached = 1;
}
/* If the transfer is yet not complete, we need to read more, so
* return ASAP and wait for the handler to be called again. */
if (!eof_reached) return;
}
/* We reach this point in one of the following cases:
*
* 1. The replica is using diskless replication, that is, it reads data
* directly from the socket to the Redis memory, without using
* a temporary RDB file on disk. In that case we just block and
* read everything from the socket.
*
* 2. Or when we are done reading from the socket to the RDB file, in
* such case we want just to read the RDB file in memory. */
/* We need to stop any AOF rewriting child before flushing and parsing
* the RDB, otherwise we'll create a copy-on-write disaster. */
if (server.aof_state != AOF_OFF) stopAppendOnly();
/* Also try to stop save RDB child before flushing and parsing the RDB:
* 1. Ensure background save doesn't overwrite synced data after being loaded.
* 2. Avoid copy-on-write disaster. */
if (server.child_type == CHILD_TYPE_RDB) {
if (!use_diskless_load) {
serverLog(LL_NOTICE,
"Replica is about to load the RDB file received from the "
"master, but there is a pending RDB child running. "
"Killing process %ld and removing its temp file to avoid "
"any race",
(long) server.child_pid);
}
killRDBChild();
}
if (use_diskless_load && server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) {
/* Initialize empty tempDb dictionaries. */
diskless_load_tempDb = disklessLoadInitTempDb();
temp_functions_lib_ctx = functionsLibCtxCreate();
moduleFireServerEvent(REDISMODULE_EVENT_REPL_ASYNC_LOAD,
REDISMODULE_SUBEVENT_REPL_ASYNC_LOAD_STARTED,
NULL);
} else {
replicationAttachToNewMaster();
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Flushing old data");
emptyData(-1,empty_db_flags,replicationEmptyDbCallback);
}
/* Before loading the DB into memory we need to delete the readable
* handler, otherwise it will get called recursively since
* rdbLoad() will call the event loop to process events from time to
* time for non blocking loading. */
connSetReadHandler(conn, NULL);
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Loading DB in memory");
rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
if (use_diskless_load) {
rio rdb;
redisDb *dbarray;
functionsLibCtx* functions_lib_ctx;
int asyncLoading = 0;
if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) {
/* Async loading means we continue serving read commands during full resync, and
* "swap" the new db with the old db only when loading is done.
* It is enabled only on SWAPDB diskless replication when master replication ID hasn't changed,
* because in that state the old content of the db represents a different point in time of the same
* data set we're currently receiving from the master. */
if (memcmp(server.replid, server.master_replid, CONFIG_RUN_ID_SIZE) == 0) {
asyncLoading = 1;
}
dbarray = diskless_load_tempDb;
functions_lib_ctx = temp_functions_lib_ctx;
} else {
dbarray = server.db;
functions_lib_ctx = functionsLibCtxGetCurrent();
functionsLibCtxClear(functions_lib_ctx);
}
rioInitWithConn(&rdb,conn,server.repl_transfer_size);
/* Put the socket in blocking mode to simplify RDB transfer.
* We'll restore it when the RDB is received. */
connBlock(conn);
connRecvTimeout(conn, server.repl_timeout*1000);
startLoading(server.repl_transfer_size, RDBFLAGS_REPLICATION, asyncLoading);
int loadingFailed = 0;
rdbLoadingCtx loadingCtx = { .dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx };
if (rdbLoadRioWithLoadingCtx(&rdb,RDBFLAGS_REPLICATION,&rsi,&loadingCtx) != C_OK) {
/* RDB loading failed. */
serverLog(LL_WARNING,
"Failed trying to load the MASTER synchronization DB "
"from socket, check server logs.");
loadingFailed = 1;
} else if (usemark) {
/* Verify the end mark is correct. */
if (!rioRead(&rdb, buf, CONFIG_RUN_ID_SIZE) ||
memcmp(buf, eofmark, CONFIG_RUN_ID_SIZE) != 0)
{
serverLog(LL_WARNING, "Replication stream EOF marker is broken");
loadingFailed = 1;
}
}
if (loadingFailed) {
stopLoading(0);
cancelReplicationHandshake(1);
rioFreeConn(&rdb, NULL);
if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) {
/* Discard potentially partially loaded tempDb. */
moduleFireServerEvent(REDISMODULE_EVENT_REPL_ASYNC_LOAD,
REDISMODULE_SUBEVENT_REPL_ASYNC_LOAD_ABORTED,
NULL);
disklessLoadDiscardTempDb(diskless_load_tempDb);
functionsLibCtxFree(temp_functions_lib_ctx);
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Discarding temporary DB in background");
} else {
/* Remove the half-loaded data in case we started with an empty replica. */
emptyData(-1,empty_db_flags,replicationEmptyDbCallback);
}
/* Note that there's no point in restarting the AOF on SYNC
* failure, it'll be restarted when sync succeeds or the replica
* gets promoted. */
return;
}
/* RDB loading succeeded if we reach this point. */
if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) {
/* We will soon swap main db with tempDb and replicas will start
* to apply data from new master, we must discard the cached
* master structure and force resync of sub-replicas. */
replicationAttachToNewMaster();
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Swapping active DB with loaded DB");
swapMainDbWithTempDb(diskless_load_tempDb);
/* swap existing functions ctx with the temporary one */
functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx);
moduleFireServerEvent(REDISMODULE_EVENT_REPL_ASYNC_LOAD,
REDISMODULE_SUBEVENT_REPL_ASYNC_LOAD_COMPLETED,
NULL);
/* Delete the old db as it's useless now. */
disklessLoadDiscardTempDb(diskless_load_tempDb);
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Discarding old DB in background");
}
/* Inform about db change, as replication was diskless and didn't cause a save. */
server.dirty++;
stopLoading(1);
/* Cleanup and restore the socket to the original state to continue
* with the normal replication. */
rioFreeConn(&rdb, NULL);
connNonBlock(conn);
connRecvTimeout(conn,0);
} else {
/* Make sure the new file (also used for persistence) is fully synced
* (not covered by earlier calls to rdb_fsync_range). */
if (fsync(server.repl_transfer_fd) == -1) {
serverLog(LL_WARNING,
"Failed trying to sync the temp DB to disk in "
"MASTER <-> REPLICA synchronization: %s",
strerror(errno));
cancelReplicationHandshake(1);
return;
}
/* Rename rdb like renaming rewrite aof asynchronously. */
int old_rdb_fd = open(server.rdb_filename,O_RDONLY|O_NONBLOCK);
if (rename(server.repl_transfer_tmpfile,server.rdb_filename) == -1) {
serverLog(LL_WARNING,
"Failed trying to rename the temp DB into %s in "
"MASTER <-> REPLICA synchronization: %s",
server.rdb_filename, strerror(errno));
cancelReplicationHandshake(1);
if (old_rdb_fd != -1) close(old_rdb_fd);
return;
}
/* Close old rdb asynchronously. */
if (old_rdb_fd != -1) bioCreateCloseJob(old_rdb_fd, 0, 0);
/* Sync the directory to ensure rename is persisted */
if (fsyncFileDir(server.rdb_filename) == -1) {
serverLog(LL_WARNING,
"Failed trying to sync DB directory %s in "
"MASTER <-> REPLICA synchronization: %s",
server.rdb_filename, strerror(errno));
cancelReplicationHandshake(1);
return;
}
if (rdbLoad(server.rdb_filename,&rsi,RDBFLAGS_REPLICATION) != RDB_OK) {
serverLog(LL_WARNING,
"Failed trying to load the MASTER synchronization "
"DB from disk, check server logs.");
cancelReplicationHandshake(1);
if (server.rdb_del_sync_files && allPersistenceDisabled()) {
serverLog(LL_NOTICE,"Removing the RDB file obtained from "
"the master. This replica has persistence "
"disabled");
bg_unlink(server.rdb_filename);
}
/* Note that there's no point in restarting the AOF on sync failure,
it'll be restarted when sync succeeds or replica promoted. */
return;
}
/* Cleanup. */
if (server.rdb_del_sync_files && allPersistenceDisabled()) {
serverLog(LL_NOTICE,"Removing the RDB file obtained from "
"the master. This replica has persistence "
"disabled");
bg_unlink(server.rdb_filename);
}
zfree(server.repl_transfer_tmpfile);
server.repl_transfer_fd = -1;
server.repl_transfer_tmpfile = NULL;
}
/* Final setup of the connected slave <- master link */
replicationCreateMasterClient(server.repl_transfer_s,rsi.repl_stream_db);
server.repl_state = REPL_STATE_CONNECTED;
server.repl_down_since = 0;
/* Fire the master link modules event. */
moduleFireServerEvent(REDISMODULE_EVENT_MASTER_LINK_CHANGE,
REDISMODULE_SUBEVENT_MASTER_LINK_UP,
NULL);
/* After a full resynchronization we use the replication ID and
* offset of the master. The secondary ID / offset are cleared since
* we are starting a new history. */
memcpy(server.replid,server.master->replid,sizeof(server.replid));
server.master_repl_offset = server.master->reploff;
clearReplicationId2();
/* Let's create the replication backlog if needed. Slaves need to
* accumulate the backlog regardless of the fact they have sub-slaves
* or not, in order to behave correctly if they are promoted to
* masters after a failover. */
if (server.repl_backlog == NULL) createReplicationBacklog();
serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Finished with success");
if (server.supervised_mode == SUPERVISED_SYSTEMD) {
redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Finished with success. Ready to accept connections in read-write mode.\n");
}
/* Send the initial ACK immediately to put this replica in online state. */
if (usemark) replicationSendAck();
/* Restart the AOF subsystem now that we finished the sync. This
* will trigger an AOF rewrite, and when done will start appending
* to the new file. */
if (server.aof_enabled) restartAOFAfterSYNC();
return;
error:
cancelReplicationHandshake(1);
return;
}
【无标题】
于 2023-06-08 21:34:48 首次发布