set
get
Expired-key deletion
Lazy deletion
On every read, check whether the key has expired; if it has, delete it.
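The annotated Redis implementation (expireIfNeeded) follows. First, a minimal sketch of the idea in Go — a hypothetical cache type, not Redis code — just to show where the check sits: the expiry test happens inside Get, so a key is only ever removed when something touches it.

package main

import (
	"fmt"
	"time"
)

type entry struct {
	value    string
	expireAt time.Time // zero value means "no expiry"
}

type cache struct{ data map[string]entry }

// Get implements lazy deletion: the expiry check happens only when the key
// is read, and an expired key is removed at that moment.
func (c *cache) Get(key string) (string, bool) {
	e, ok := c.data[key]
	if !ok {
		return "", false
	}
	if !e.expireAt.IsZero() && time.Now().After(e.expireAt) {
		delete(c.data, key) // expired: remove it on access
		return "", false
	}
	return e.value, true
}

func main() {
	c := &cache{data: map[string]entry{
		"k": {value: "v", expireAt: time.Now().Add(time.Millisecond)},
	}}
	time.Sleep(2 * time.Millisecond)
	fmt.Println(c.Get("k")) // "" false: the key was removed by this read
}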
/*
 * Check whether key has expired, and if so remove it from the database.
 *
 * Returns 0 if the key has no expire time, or has not expired yet.
 *
 * Returns 1 if the key was deleted because it had expired.
 */
int expireIfNeeded(redisDb *db, robj *key) {
// Fetch the key's expire time
mstime_t when = getExpire(db,key);
mstime_t now;
// The key has no expire time
if (when < 0) return 0; /* No expire for this key */
/* Don't expire anything while loading. It will be done later. */
if (server.loading) return 0;
/* If we are in the context of a Lua script, we claim that time is
* blocked to when the Lua script started. This way a key can expire
* only the first time it is accessed and not in the middle of the
* script execution, making propagation to slaves / AOF consistent.
* See issue #1525 on Github for more information. */
now = server.lua_caller ? server.lua_time_start : mstime();
/* If we are running in the context of a slave, return ASAP:
* the slave key expiration is controlled by the master that will
* send us synthesized DEL operations for expired keys.
*
* Still we try to return the right information to the caller,
* that is, 0 if we think the key should be still valid, 1 if
* we think the key is expired at this time. */
// A slave never expires keys on its own: it only returns the logically
// correct answer and waits for the master to send an explicit DEL,
// which keeps the data sets in sync
if (server.masterhost != NULL) return now > when;
// At this point the key has an expire time and we are acting as a master
/* Return when this key has not expired */
if (now <= when) return 0;
/* Delete the key */
server.stat_expiredkeys++;
// Propagate the expire to the AOF and to the slaves
propagateExpire(db,key);
// Fire a keyspace event notification
notifyKeyspaceEvent(REDIS_NOTIFY_EXPIRED,
"expired",key,db->id);
// Delete the expired key from the database
return dbDelete(db,key);
}
/* Delete a key, value, and associated expiration entry if any, from the DB.
 *
 * Returns 1 if the key was deleted, or 0 if the key did not exist. */
int dbDelete(redisDb *db, robj *key) {
/* Deleting an entry from the expires dict will not free the sds of
* the key, because it is shared with the main dictionary. */
// Remove the key's expire time
if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);
// Remove the key-value pair
if (dictDelete(db->dict,key->ptr) == DICT_OK) {
// In cluster mode, also remove the key from its slot
if (server.cluster_enabled) slotToKeyDel(key);
return 1;
} else {
// The key did not exist
return 0;
}
}
Periodic deletion
Every so often, scan the databases and delete the expired keys found in them.
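activeExpireCycle below is long mostly because of bookkeeping (per-DB statistics, a CPU-time budget, fast vs. slow cycles); the core loop is simply "sample a few keys that carry a TTL, delete the expired ones, and repeat while a large fraction of each sample turns out to be expired". A rough Go sketch of just that core, under those assumptions (hypothetical names, no time-limit handling):

package main

import (
	"fmt"
	"time"
)

// activeExpireOnce samples up to sampleSize keys that carry an expire time
// and deletes the expired ones from both maps. It returns how many were
// expired, so a caller can keep cycling while a large fraction of each
// sample keeps turning out to be expired.
func activeExpireOnce(data map[string]string, expires map[string]time.Time, sampleSize int) int {
	expired := 0
	now := time.Now()
	for key, when := range expires { // Go map iteration order is unspecified: a cheap random sample
		if sampleSize == 0 {
			break
		}
		sampleSize--
		if now.After(when) {
			delete(expires, key)
			delete(data, key)
			expired++
		}
	}
	return expired
}

func main() {
	data := map[string]string{"a": "1", "b": "2"}
	expires := map[string]time.Time{"a": time.Now().Add(-time.Second)}
	for {
		// Stop once no more than a quarter of the sample was expired,
		// mirroring the do/while condition in activeExpireCycle.
		if activeExpireOnce(data, expires, 20) <= 20/4 {
			break
		}
	}
	fmt.Println(data) // map[b:2]
}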
void activeExpireCycle(int type) {
/* This function has some global state in order to continue the work
* incrementally across calls. */
// Static variables that carry state across successive calls
static unsigned int current_db = 0; /* Last DB tested. */
static int timelimit_exit = 0; /* Time limit hit in previous call? */
static long long last_fast_cycle = 0; /* When last fast cycle ran. */
unsigned int j, iteration = 0;
// Default number of databases to examine per call
unsigned int dbs_per_call = REDIS_DBCRON_DBS_PER_CALL;
// Time the function started
long long start = ustime(), timelimit;
// Fast cycle
if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
/* Don't start a fast cycle if the previous cycle did not exit
 * because of the time limit. Also don't repeat a fast cycle for the same
 * period as the fast cycle total duration itself. */
// If the previous cycle did not hit the time limit, skip the fast cycle
if (!timelimit_exit) return;
// If not enough time has passed since the last fast cycle, skip it
if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return;
// We are going to run a fast cycle: record its start time
last_fast_cycle = start;
}
/* We usually should test REDIS_DBCRON_DBS_PER_CALL per iteration, with
 * two exceptions:
 *
 * 1) Don't test more DBs than we have.
 * 2) If last time we hit the time limit, we want to scan all DBs
 * in this iteration, as there is work to do in some DB and we don't want
 * expired keys to use memory for too much time. */
if (dbs_per_call > server.dbnum || timelimit_exit)
dbs_per_call = server.dbnum;
/* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time
* per iteration. Since this function gets called with a frequency of
* server.hz times per second, the following is the max amount of
* microseconds we can spend in this function. */
// Upper bound, in microseconds, on how long this call may run
// ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC defaults to 25, i.e. 25% of the CPU time
timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/server.hz/100;
timelimit_exit = 0;
if (timelimit <= 0) timelimit = 1;
// In fast mode the cycle may run for at most
// ACTIVE_EXPIRE_CYCLE_FAST_DURATION microseconds (1000 by default)
if (type == ACTIVE_EXPIRE_CYCLE_FAST)
timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */
// Iterate over the databases
for (j = 0; j < dbs_per_call; j++) {
int expired;
// The database to process in this iteration
redisDb *db = server.db+(current_db % server.dbnum);
/* Increment the DB now so we are sure if we run out of time
* in the current DB we'll restart from the next. This allows to
* distribute the time evenly across DBs. */
// Advance the DB counter now, so that if the do loop below exits because
// of the time limit, the next call resumes from the following DB
current_db++;
/* Continue to expire if at the end of the cycle more than 25%
* of the keys were expired. */
do {
unsigned long num, slots;
long long now, ttl_sum;
int ttl_samples;
/* If there is nothing to expire try next DB ASAP. */
// Number of keys with an expire time set;
// if it is 0, skip this database right away
if ((num = dictSize(db->expires)) == 0) {
db->avg_ttl = 0;
break;
}
// Number of buckets in the expires hash table
slots = dictSlots(db->expires);
// Current time
now = mstime();
/* When there are less than 1% filled slots getting random
* keys is expensive, so stop here waiting for better times...
* The dictionary will be resized asap. */
if (num && slots > DICT_HT_INITIAL_SIZE &&
(num*100/slots < 1)) break;
/* The main collection cycle. Sample random keys among keys
 * with an expire set, checking for expired ones. */
// Number of sampled keys found expired
expired = 0;
// Sum of the sampled keys' TTLs
ttl_sum = 0;
// Number of keys sampled
ttl_samples = 0;
// Check at most ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP keys per sample
if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP)
num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP;
// Sample keys from this database's expires dict
while (num--) {
dictEntry *de;
long long ttl;
// Pick a random key that has an expire time set
if ((de = dictGetRandomKey(db->expires)) == NULL) break;
// Compute its TTL
ttl = dictGetSignedIntegerVal(de)-now;
// If the key has expired, delete it and increment the expired counter
if (activeExpireCycleTryExpire(db,de,now)) expired++;
if (ttl < 0) ttl = 0;
// Accumulate the key's TTL
ttl_sum += ttl;
// Count the sampled key
ttl_samples++;
}
/* Update the average TTL stats for this database. */
if (ttl_samples) {
// Average TTL of this sample
long long avg_ttl = ttl_sum/ttl_samples;
// If this is the first sample for this database, initialize its average TTL
if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
/* Smooth the value averaging with the previous one. */
db->avg_ttl = (db->avg_ttl+avg_ttl)/2;
}
/* We can't block forever here even if there are many keys to
* expire. So after a given amount of milliseconds return to the
* caller waiting for the other active expire cycle. */
// Count this iteration
iteration++;
// Check the time limit once every 16 iterations
if ((iteration & 0xf) == 0 && /* check once every 16 iterations. */
(ustime()-start) > timelimit)
{
// We have gone over the time limit: record it in timelimit_exit
timelimit_exit = 1;
}
// Out of time: return
if (timelimit_exit) return;
/* We don't repeat the cycle if there are less than 25% of keys
* found expired in the current DB. */
// Keep looping over this database only while more than 25% of the
// sampled keys turned out to be expired
} while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4);
}
}
I/O multiplexing
/*
 * Monitor fd for the events given in mask,
 * and call proc when fd becomes ready
 */
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
aeFileProc *proc, void *clientData)
{
if (fd >= eventLoop->setsize) {
errno = ERANGE;
return AE_ERR;
}
// Look up the file event structure for this fd
aeFileEvent *fe = &eventLoop->events[fd];
// Start monitoring the given events on fd
if (aeApiAddEvent(eventLoop, fd, mask) == -1)
return AE_ERR;
// Record the event types and their handlers
fe->mask |= mask;
if (mask & AE_READABLE) fe->rfileProc = proc;
if (mask & AE_WRITABLE) fe->wfileProc = proc;
// Private data passed to the handler
fe->clientData = clientData;
// Update the event loop's maximum fd if needed
if (fd > eventLoop->maxfd)
eventLoop->maxfd = fd;
return AE_OK;
}
/*
 * Stop monitoring fd for the events given in mask
 */
void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
{
if (fd >= eventLoop->setsize) return;
// Look up the file event structure for this fd
aeFileEvent *fe = &eventLoop->events[fd];
// Nothing is being monitored for this fd; return
if (fe->mask == AE_NONE) return;
// Compute the new event mask
fe->mask = fe->mask & (~mask);
if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
/* Update the max fd */
int j;
for (j = eventLoop->maxfd-1; j >= 0; j--)
if (eventLoop->events[j].mask != AE_NONE) break;
eventLoop->maxfd = j;
}
// Stop monitoring the given events on this fd
aeApiDelEvent(eventLoop, fd, mask);
}
/* Wait for milliseconds until the given file descriptor becomes
 * writable/readable/exception. */
int aeWait(int fd, int mask, long long milliseconds) {
struct pollfd pfd;
int retmask = 0, retval;
memset(&pfd, 0, sizeof(pfd));
pfd.fd = fd;
if (mask & AE_READABLE) pfd.events |= POLLIN;
if (mask & AE_WRITABLE) pfd.events |= POLLOUT;
if ((retval = poll(&pfd, 1, milliseconds))== 1) {
if (pfd.revents & POLLIN) retmask |= AE_READABLE;
if (pfd.revents & POLLOUT) retmask |= AE_WRITABLE;
if (pfd.revents & POLLERR) retmask |= AE_WRITABLE;
if (pfd.revents & POLLHUP) retmask |= AE_WRITABLE;
return retmask;
} else {
return retval;
}
}
/*
 * Retrieve the events that are ready to fire
 */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
aeApiState *state = eventLoop->apidata;
int retval, numevents = 0;
// Wait for events, blocking up to the timeout in tvp (forever if tvp is NULL)
retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
// At least one event is ready?
if (retval > 0) {
int j;
// Translate each ready event into an AE mask
// and record it in the event loop's fired array
numevents = retval;
for (j = 0; j < numevents; j++) {
int mask = 0;
struct epoll_event *e = state->events+j;
if (e->events & EPOLLIN) mask |= AE_READABLE;
if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
if (e->events & EPOLLERR) mask |= AE_WRITABLE;
if (e->events & EPOLLHUP) mask |= AE_WRITABLE;
eventLoop->fired[j].fd = e->data.fd;
eventLoop->fired[j].mask = mask;
}
}
// Return the number of ready events
return numevents;
}
int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
int processed = 0, numevents;
/* Nothing to do? return ASAP */
if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;
/* Note that we want to call select() even if there are no
* file events to process as long as we want to process time
* events, in order to sleep until the next time event is ready
* to fire. */
if (eventLoop->maxfd != -1 ||
((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
int j;
aeTimeEvent *shortest = NULL;
struct timeval tv, *tvp;
// Find the nearest time event
if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
shortest = aeSearchNearestTimer(eventLoop);
if (shortest) {
// 如果时间事件存在的话
// 那么根据最近可执行时间事件和现在时间的时间差来决定文件事件的阻塞时间
long now_sec, now_ms;
/* Calculate the time missing for the nearest
* timer to fire. */
aeGetTime(&now_sec, &now_ms);
tvp = &tv;
tvp->tv_sec = shortest->when_sec - now_sec;
if (shortest->when_ms < now_ms) {
tvp->tv_usec = ((shortest->when_ms+1000) - now_ms)*1000;
tvp->tv_sec --;
} else {
tvp->tv_usec = (shortest->when_ms - now_ms)*1000;
}
// A negative difference means the event is already due: do not block
if (tvp->tv_sec < 0) tvp->tv_sec = 0;
if (tvp->tv_usec < 0) tvp->tv_usec = 0;
} else {
// 执行到这一步,说明没有时间事件
// 那么根据 AE_DONT_WAIT 是否设置来决定是否阻塞,以及阻塞的时间长度
/* If we have to check for events but need to return
* ASAP because of AE_DONT_WAIT we need to set the timeout
* to zero */
if (flags & AE_DONT_WAIT) {
// Poll without blocking
tv.tv_sec = tv.tv_usec = 0;
tvp = &tv;
} else {
/* Otherwise we can block */
tvp = NULL; /* wait forever */
}
}
// Process file events; tvp decides how long the poll may block
numevents = aeApiPoll(eventLoop, tvp);
for (j = 0; j < numevents; j++) {
// Fetch the event from the fired array
aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
int mask = eventLoop->fired[j].mask;
int fd = eventLoop->fired[j].fd;
int rfired = 0;
/* note the fe->mask & mask & ... code: maybe an already processed
* event removed an element that fired and we still haven't
* processed, so we check if the event is still valid. */
// Readable event
if (fe->mask & mask & AE_READABLE) {
// rfired records that the read handler ran, so the same handler is not
// invoked a second time for the write event below
rfired = 1;
fe->rfileProc(eventLoop,fd,fe->clientData,mask);
}
// Writable event
if (fe->mask & mask & AE_WRITABLE) {
if (!rfired || fe->wfileProc != fe->rfileProc)
fe->wfileProc(eventLoop,fd,fe->clientData,mask);
}
processed++;
}
}
/* Check time events */
if (flags & AE_TIME_EVENTS)
processed += processTimeEvents(eventLoop);
return processed; /* return the number of processed file/time events */
}
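Stripped of the tv/tvp bookkeeping, aeProcessEvents is: derive the poll timeout from the nearest time event, poll, run the handlers of whatever fired, then run the due time events. A schematic sketch of that flow in Go (hypothetical interfaces, not the ae.c API):

package main

import "time"

type fileEvent struct {
	readable, writable func()
}

type timeEvent struct {
	when time.Time
	proc func()
}

type fired struct {
	fd                 int
	readable, writable bool
}

// processEvents mirrors the shape of aeProcessEvents: the poll timeout is
// derived from the nearest time event, the handlers of every fired file
// event run first, and then any time events that have come due are run.
func processEvents(events map[int]fileEvent, timers []timeEvent,
	poll func(timeout time.Duration) []fired) int {
	processed := 0

	// 1. Block at most until the nearest time event should fire
	// (a negative timeout here means "wait forever").
	timeout := time.Duration(-1)
	for _, t := range timers {
		d := time.Until(t.when)
		if d < 0 {
			d = 0
		}
		if timeout < 0 || d < timeout {
			timeout = d
		}
	}

	// 2. Wait for file events (epoll_wait in the Linux backend).
	for _, f := range poll(timeout) {
		fe := events[f.fd]
		if f.readable && fe.readable != nil {
			fe.readable()
		}
		if f.writable && fe.writable != nil {
			fe.writable()
		}
		processed++
	}

	// 3. Run the time events that have come due.
	now := time.Now()
	for _, t := range timers {
		if !t.when.After(now) {
			t.proc()
			processed++
		}
	}
	return processed
}

func main() {
	// A poll backend that never reports anything ready; a real one would wrap epoll/kqueue.
	idle := func(timeout time.Duration) []fired { return nil }
	processEvents(map[int]fileEvent{}, nil, idle)
}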
/*
 * Create a new client
 */
redisClient *createClient(int fd) {
// Allocate memory
redisClient *c = zmalloc(sizeof(redisClient));
/* passing -1 as fd it is possible to create a non connected client.
* This is useful since all the Redis commands needs to be executed
* in the context of a client. When commands are executed in other
* contexts (for instance a Lua script) we need a non connected client. */
// With fd != -1 we create a client backed by a network connection;
// with fd == -1 we create a connectionless fake client. Redis commands must
// run in the context of a client, so a fake client is used when commands
// are executed from other contexts, such as a Lua script
if (fd != -1) {
// Non-blocking
anetNonBlock(NULL,fd);
// Disable Nagle's algorithm
anetEnableTcpNoDelay(NULL,fd);
// Enable TCP keepalive
if (server.tcpkeepalive)
anetKeepAlive(NULL,fd,server.tcpkeepalive);
// Register the read event with the event loop (start receiving commands)
if (aeCreateFileEvent(server.el,fd,AE_READABLE,
readQueryFromClient, c) == AE_ERR)
{
close(fd);
zfree(c);
return NULL;
}
}
// Initialize the remaining fields
// Default database
selectDb(c,0);
// Socket descriptor
c->fd = fd;
// Name
c->name = NULL;
// Position in the fixed-size reply buffer
c->bufpos = 0;
// Query buffer
c->querybuf = sdsempty();
// Query buffer peak size
c->querybuf_peak = 0;
// Request type
c->reqtype = 0;
// Number of command arguments
c->argc = 0;
// Command arguments
c->argv = NULL;
// Command being executed and last command executed
c->cmd = c->lastcmd = NULL;
// Number of multi bulk arguments still to be read
c->multibulklen = 0;
// Length of the bulk argument being read
c->bulklen = -1;
// Number of bytes already sent
c->sentlen = 0;
// Status flags
c->flags = 0;
// Creation time and time of last interaction
c->ctime = c->lastinteraction = server.unixtime;
// Authentication state
c->authenticated = 0;
// Replication state
c->replstate = REDIS_REPL_NONE;
// Replication offset
c->reploff = 0;
// Replication offset acknowledged via REPLCONF ACK
c->repl_ack_off = 0;
// Time the last REPLCONF ACK was received
c->repl_ack_time = 0;
// Used when the client is a slave: the listening port it announced
c->slave_listening_port = 0;
// Reply list
c->reply = listCreate();
// Total byte size of the reply list
c->reply_bytes = 0;
// Time when the output buffer first exceeded the soft limit
c->obuf_soft_limit_reached_time = 0;
// Free and dup methods for the reply list
listSetFreeMethod(c->reply,decrRefCountVoid);
listSetDupMethod(c->reply,dupClientReplyValue);
// Blocking type
c->btype = REDIS_BLOCKED_NONE;
// Blocking timeout
c->bpop.timeout = 0;
// List keys the client is blocked on
c->bpop.keys = dictCreate(&setDictType,NULL);
// Destination key for elements popped when the client is unblocked;
// used by BRPOPLPUSH
c->bpop.target = NULL;
c->bpop.numreplicas = 0;
c->bpop.reploffset = 0;
c->woff = 0;
// Keys watched during a transaction
c->watched_keys = listCreate();
// Subscribed channels and patterns
c->pubsub_channels = dictCreate(&setDictType,NULL);
c->pubsub_patterns = listCreate();
c->peerid = NULL;
listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid);
listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
// If this is not a fake client, add it to the server's client list
if (fd != -1) listAddNodeTail(server.clients,c);
// Initialize the client's transaction state
initClientMultiState(c);
// Return the client
return c;
}
/*
 * Read data sent by the client into its query buffer
 */
void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
redisClient *c = (redisClient*) privdata;
int nread, readlen;
size_t qblen;
REDIS_NOTUSED(el);
REDIS_NOTUSED(mask);
// Set the server's current client
server.current_client = c;
// Amount to read (REDIS_IOBUF_LEN, 16 KB by default)
readlen = REDIS_IOBUF_LEN;
/* If this is a multi bulk request, and we are processing a bulk reply
* that is large enough, try to maximize the probability that the query
* buffer contains exactly the SDS string representing the object, even
* at the risk of requiring more read(2) calls. This way the function
* processMultiBulkBuffer() can avoid copying buffers to create the
* Redis Object representing the argument. */
if (c->reqtype == REDIS_REQ_MULTIBULK && c->multibulklen && c->bulklen != -1
&& c->bulklen >= REDIS_MBULK_BIG_ARG)
{
int remaining = (unsigned)(c->bulklen+2)-sdslen(c->querybuf);
if (remaining < readlen) readlen = remaining;
}
// Current length of the query buffer. After a short read, leftover bytes
// may remain in the buffer without yet forming a complete protocol command
qblen = sdslen(c->querybuf);
// Update the query buffer's peak size if needed
if (c->querybuf_peak < qblen) c->querybuf_peak = qblen;
// Make room in the query buffer
c->querybuf = sdsMakeRoomFor(c->querybuf, readlen);
// Read from the socket into the query buffer
nread = read(fd, c->querybuf+qblen, readlen);
// Read error
if (nread == -1) {
if (errno == EAGAIN) {
nread = 0;
} else {
redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
freeClient(c);
return;
}
// EOF: the client closed the connection
} else if (nread == 0) {
redisLog(REDIS_VERBOSE, "Client closed connection");
freeClient(c);
return;
}
if (nread) {
// Update the SDS free and len fields for the bytes just read,
// keeping the terminating '\0' in place
sdsIncrLen(c->querybuf,nread);
// Record the time of the last interaction with this client
c->lastinteraction = server.unixtime;
// If this client is our master, advance its replication offset
if (c->flags & REDIS_MASTER) c->reploff += nread;
} else {
// Reached when nread == -1 and errno == EAGAIN
server.current_client = NULL;
return;
}
// The query buffer has exceeded the server's maximum allowed length:
// log the offending client and free it
if (sdslen(c->querybuf) > server.client_max_querybuf_len) {
sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty();
bytes = sdscatrepr(bytes,c->querybuf,64);
redisLog(REDIS_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes);
sdsfree(ci);
sdsfree(bytes);
freeClient(c);
return;
}
// Parse the query buffer into commands and arguments and execute them;
// processing continues until no complete command is left in the buffer
processInputBuffer(c);
server.current_client = NULL;
}
// Process the command data in the client's input buffer
void processInputBuffer(redisClient *c) {
/* Keep processing while there is something in the input buffer */
// Process as much of the query buffer as possible. After a short read the
// leftover bytes may not yet form a complete command, in which case we
// have to wait for the next read event
while(sdslen(c->querybuf)) {
/* Return if clients are paused. */
if (!(c->flags & REDIS_SLAVE) && clientsArePaused()) return;
/* Immediately abort if the client is in the middle of something. */
// REDIS_BLOCKED means the client is currently blocked
if (c->flags & REDIS_BLOCKED) return;
/* REDIS_CLOSE_AFTER_REPLY closes the connection once the reply is
* written to the client. Make sure to not let the reply grow after
* this flag has been set (i.e. don't process more commands). */
// The close-after-reply flag is set; no point in processing further commands
if (c->flags & REDIS_CLOSE_AFTER_REPLY) return;
/* Determine request type when unknown. */
// The two request types are described in the Redis protocol specification:
// http://redis.readthedocs.org/en/latest/topic/protocol.html
// Roughly, multibulk requests come from ordinary clients,
// while inline requests are what a TELNET session sends
if (!c->reqtype) {
if (c->querybuf[0] == '*') {
// Multibulk request
c->reqtype = REDIS_REQ_MULTIBULK;
} else {
// Inline request
c->reqtype = REDIS_REQ_INLINE;
}
}
// Turn the buffered bytes into a command and its arguments
if (c->reqtype == REDIS_REQ_INLINE) {
if (processInlineBuffer(c) != REDIS_OK) break;
} else if (c->reqtype == REDIS_REQ_MULTIBULK) {
if (processMultibulkBuffer(c) != REDIS_OK) break;
} else {
redisPanic("Unknown request type");
}
/* Multibulk processing could see a <= 0 length. */
if (c->argc == 0) {
resetClient(c);
} else {
/* Only reset the client when the command was executed. */
// Execute the command, then reset the client
if (processCommand(c) == REDIS_OK)
resetClient(c);
}
}
}
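For reference, here is what the two request encodings distinguished by the reqtype branch above look like on the wire. The bytes are standard RESP, written as Go string literals; the command is the same in both forms:

package main

import "fmt"

func main() {
	// Multibulk form, sent by ordinary clients; parsed by processMultibulkBuffer.
	multibulk := "*3\r\n$3\r\nSET\r\n$3\r\nkey\r\n$5\r\nvalue\r\n"

	// Inline form, what a raw telnet session sends; parsed by processInlineBuffer.
	inline := "SET key value\r\n"

	fmt.Printf("%q\n%q\n", multibulk, inline)
}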
Replication
/* SYNC and PSYNC command implementation. */
void syncCommand(redisClient *c) {
/* ignore SYNC if already slave or in monitor mode */
if (c->flags & REDIS_SLAVE) return;
/* Refuse SYNC requests if we are a slave but the link with our master
* is not ok... */
if (server.masterhost && server.repl_state != REDIS_REPL_CONNECTED) {
addReplyError(c,"Can't SYNC while not connected with my master");
return;
}
/* SYNC can't be issued when the server has pending data to send to
* the client about already issued commands. We need a fresh reply
* buffer registering the differences between the BGSAVE and the current
* dataset, so that we can copy to other slaves if needed. */
if (listLength(c->reply) != 0 || c->bufpos != 0) {
addReplyError(c,"SYNC and PSYNC are invalid with pending output");
return;
}
redisLog(REDIS_NOTICE,"Slave asks for synchronization");
/* Try a partial resynchronization if this is a PSYNC command.
 *
 * If it fails, we continue with the usual full resynchronization; however,
 * when this happens masterTryPartialResynchronization() has already
 * replied with:
 *
 * +FULLRESYNC <runid> <offset>
 *
 * so the slave knows the new runid and offset to try a PSYNC later
 * if the connection with the master is lost. */
if (!strcasecmp(c->argv[0]->ptr,"psync")) {
// Attempt a partial resynchronization
if (masterTryPartialResynchronization(c) == REDIS_OK) {
// Partial resynchronization accepted
server.stat_sync_partial_ok++;
return; /* No full resync needed, return. */
} else {
// Partial resynchronization not possible
char *master_runid = c->argv[1]->ptr;
/* Increment stats for failed PSYNCs, but only if the
* runid is not "?", as this is used by slaves to force a full
* resync on purpose when they are not albe to partially
* resync. */
if (master_runid[0] != '?') server.stat_sync_partial_err++;
}
} else {
/* If a slave uses SYNC, we are dealing with an old implementation
* of the replication protocol (like redis-cli --slave). Flag the client
* so that we don't expect to receive REPLCONF ACK feedbacks. */
c->flags |= REDIS_PRE_PSYNC;
}
// From here on we handle full resynchronization...
/* Full resynchronization. */
// We are doing a full resynchronization: increment the counter
server.stat_sync_full++;
/* Here we need to check if there is a background saving operation
* in progress, or if it is required to start one */
if (server.rdb_child_pid != -1) {
/* Ok a background save is in progress. Let's check if it is a good
* one for replication, i.e. if there is another slave that is
* registering differences since the server forked to save */
redisClient *slave;
listNode *ln;
listIter li;
// If at least one slave is already waiting for this BGSAVE to finish,
// the RDB it produces can be reused for this slave as well
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
}
if (ln) {
/* Perfect, the server is already registering differences for
* another slave. Set the right state, and copy the buffer. */
copyClientOutputBuffer(c,slave);
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
} else {
/* No way, we need to wait for the next BGSAVE in order to
* register differences */
c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
}
} else {
/* Ok we don't have a BGSAVE in progress, let's start one */
redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
if (rdbSaveBackground(server.rdb_filename) != REDIS_OK) {
redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
addReplyError(c,"Unable to perform background save");
return;
}
// Set the replication state
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
/* Flush the script cache for the new slave. */
replicationScriptCacheFlush();
}
if (server.repl_disable_tcp_nodelay)
anetDisableTcpNoDelay(NULL, c->fd); /* Non critical if it fails. */
c->repldbfd = -1;
c->flags |= REDIS_SLAVE;
server.slaveseldb = -1; /* Force to re-emit the SELECT command. */
// Add the client to the list of slaves
listAddNodeTail(server.slaves,c);
// If this is the first slave, create the replication backlog
if (listLength(server.slaves) == 1 && server.repl_backlog == NULL)
createReplicationBacklog();
return;
}
/* This function handles the PSYNC command from the point of view of a
* master receiving a request for partial resynchronization.
*
* On success return REDIS_OK, otherwise REDIS_ERR is returned and we proceed
* with the usual full resync. */
int masterTryPartialResynchronization(redisClient *c) {
long long psync_offset, psync_len;
char *master_runid = c->argv[1]->ptr;
char buf[128];
int buflen;
/* Is the runid of this master the same advertised by the wannabe slave
* via PSYNC? If runid changed this master is a different instance and
* there is no way to continue. */
if (strcasecmp(master_runid, server.runid)) {
/* Run id "?" is used by slaves that want to force a full resync. */
// The run id sent by the slave differs from ours
if (master_runid[0] != '?') {
redisLog(REDIS_NOTICE,"Partial resynchronization not accepted: "
"Runid mismatch (Client asked for runid '%s', my runid is '%s')",
master_runid, server.runid);
// The slave sent run id '?', explicitly asking for a full resync
} else {
redisLog(REDIS_NOTICE,"Full resync requested by slave.");
}
// A full resync is needed
goto need_full_resync;
}
/* We still have the data our slave is asking for? */
// Parse the psync_offset argument
if (getLongLongFromObjectOrReply(c,c->argv[2],&psync_offset,NULL) !=
REDIS_OK) goto need_full_resync;
// If we have no backlog,
if (!server.repl_backlog ||
// or psync_offset is below server.repl_backlog_off
// (the data the slave needs has already been overwritten),
psync_offset < server.repl_backlog_off ||
// or psync_offset is beyond the data the backlog currently holds,
psync_offset > (server.repl_backlog_off + server.repl_backlog_histlen))
{
// then fall back to a FULL RESYNC
redisLog(REDIS_NOTICE,
"Unable to partial resync with the slave for lack of backlog (Slave request was: %lld).", psync_offset);
if (psync_offset > server.master_repl_offset) {
redisLog(REDIS_WARNING,
"Warning: slave tried to PSYNC with an offset that is greater than the master replication offset.");
}
goto need_full_resync;
}
/* If we reached this point, we are able to perform a partial resync:
 *
 * 1) Set client state to make it a slave.
 * 2) Inform the client we can continue with +CONTINUE.
 * 3) Send the backlog data (from the offset to the end) to the slave. */
c->flags |= REDIS_SLAVE;
c->replstate = REDIS_REPL_ONLINE;
c->repl_ack_time = server.unixtime;
listAddNodeTail(server.slaves,c);
/* We can't use the connection buffers since they are used to accumulate
* new commands at this stage. But we are sure the socket send buffer is
* empty so this write will never fail actually. */
// Send +CONTINUE to tell the slave the partial resync request was accepted
buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
if (write(c->fd,buf,buflen) != buflen) {
freeClientAsync(c);
return REDIS_OK;
}
// Send the slave the part of the backlog it is missing
psync_len = addReplyReplicationBacklog(c,psync_offset);
redisLog(REDIS_NOTICE,
"Partial resynchronization request accepted. Sending %lld bytes of backlog starting from offset %lld.", psync_len, psync_offset);
/* Note that we don't need to set the selected DB at server.slaveseldb
* to -1 to force the master to emit SELECT, since the slave already
* has this state from the previous connection with the master. */
// Refresh the count of good (low-lag) slaves
refreshGoodSlavesCount();
return REDIS_OK; /* The caller can return, no full resync needed. */
need_full_resync:
/* We need a full resync for some reason... notify the client. */
// Set psync_offset to the master's current replication offset
psync_offset = server.master_repl_offset;
/* Add 1 to psync_offset if the replication backlog does not exist yet,
 * as when it is created later we'll increment the offset by one. */
if (server.repl_backlog == NULL) psync_offset++;
/* Again, we can't use the connection buffers (see above). */
// Send +FULLRESYNC to tell the slave a full resynchronization is needed
buflen = snprintf(buf,sizeof(buf),"+FULLRESYNC %s %lld\r\n",
server.runid,psync_offset);
if (write(c->fd,buf,buflen) != buflen) {
freeClientAsync(c);
return REDIS_OK;
}
return REDIS_ERR;
}
/* Feed the slave 'c' with the replication backlog starting from the
* specified 'offset' up to the end of the backlog. */
long long addReplyReplicationBacklog(redisClient *c, long long offset) {
long long j, skip, len;
redisLog(REDIS_DEBUG, "[PSYNC] Slave request offset: %lld", offset);
if (server.repl_backlog_histlen == 0) {
redisLog(REDIS_DEBUG, "[PSYNC] Backlog history len is zero");
return 0;
}
redisLog(REDIS_DEBUG, "[PSYNC] Backlog size: %lld",
server.repl_backlog_size);
redisLog(REDIS_DEBUG, "[PSYNC] First byte: %lld",
server.repl_backlog_off);
redisLog(REDIS_DEBUG, "[PSYNC] History len: %lld",
server.repl_backlog_histlen);
redisLog(REDIS_DEBUG, "[PSYNC] Current index: %lld",
server.repl_backlog_idx);
/* Compute the amount of bytes we need to discard. */
skip = offset - server.repl_backlog_off;
redisLog(REDIS_DEBUG, "[PSYNC] Skipping: %lld", skip);
/* Point j to the oldest byte, that is actually our
* server.repl_backlog_off byte. */
j = (server.repl_backlog_idx +
(server.repl_backlog_size-server.repl_backlog_histlen)) %
server.repl_backlog_size;
redisLog(REDIS_DEBUG, "[PSYNC] Index of first byte: %lld", j);
/* Discard the amount of data to seek to the specified 'offset'. */
j = (j + skip) % server.repl_backlog_size;
/* Feed slave with data. Since it is a circular buffer we have to
* split the reply in two parts if we are cross-boundary. */
len = server.repl_backlog_histlen - skip;
redisLog(REDIS_DEBUG, "[PSYNC] Reply total length: %lld", len);
while(len) {
long long thislen =
((server.repl_backlog_size - j) < len) ?
(server.repl_backlog_size - j) : len;
redisLog(REDIS_DEBUG, "[PSYNC] addReply() length: %lld", thislen);
addReplySds(c,sdsnewlen(server.repl_backlog + j, thislen));
len -= thislen;
j = 0;
}
return server.repl_backlog_histlen - skip;
}
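The index arithmetic above is easier to follow on a toy circular buffer. A hedged Go sketch of the same computation (the field names mirror repl_backlog_off / histlen / idx / size, but this is an illustration, not the Redis code):

package main

import "fmt"

type backlog struct {
	buf     []byte // circular buffer
	off     int64  // replication offset of the first (oldest) byte still held
	histlen int64  // how many valid bytes the buffer currently holds
	idx     int64  // position where the next byte will be written
}

// readFrom returns the backlog content from replication offset 'offset'
// to the end, splitting the copy in two when it wraps around the buffer,
// just like the while(len) loop in addReplyReplicationBacklog.
func (b *backlog) readFrom(offset int64) []byte {
	skip := offset - b.off
	if skip < 0 || skip > b.histlen {
		return nil // the caller would fall back to a full resync
	}
	size := int64(len(b.buf))
	// Index of the oldest byte, then advance by 'skip'.
	j := (b.idx + (size - b.histlen)) % size
	j = (j + skip) % size
	out := make([]byte, 0, b.histlen-skip)
	for remaining := b.histlen - skip; remaining > 0; {
		chunk := size - j
		if chunk > remaining {
			chunk = remaining
		}
		out = append(out, b.buf[j:j+chunk]...)
		remaining -= chunk
		j = 0
	}
	return out
}

func main() {
	// Logical content is "ABCDE"; the oldest byte 'A' sits at index 2 and
	// corresponds to replication offset 100.
	b := &backlog{buf: []byte{'D', 'E', 'A', 'B', 'C'}, off: 100, histlen: 5, idx: 2}
	fmt.Println(string(b.readFrom(103))) // "DE"
}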
codis
func (s *Proxy) serveProxy() {
if s.IsClosed() {
return
}
defer s.Close()
log.Warnf("[%p] proxy start service on %s", s, s.lproxy.Addr())
eh := make(chan error, 1)
go func(l net.Listener) (err error) {
defer func() {
eh <- err
}()
for {
c, err := s.acceptConn(l)
if err != nil {
return err
}
NewSession(c, s.config).Start(s.router)
}
}(s.lproxy)
if d := s.config.BackendPingPeriod.Duration(); d != 0 {
go s.keepAlive(d)
}
select {
case <-s.exit.C:
log.Warnf("[%p] proxy shutdown", s)
case err := <-eh:
log.ErrorErrorf(err, "[%p] proxy exit on error", s)
}
}
func (s *Session) Start(d *Router) {
s.start.Do(func() {
if int(incrSessions()) > s.config.ProxyMaxClients {
go func() {
s.Conn.Encode(redis.NewErrorf("ERR max number of clients reached"), true)
s.CloseWithError(ErrTooManySessions)
s.incrOpFails(nil, nil)
s.flushOpStats(true)
}()
decrSessions()
return
}
if !d.isOnline() {
go func() {
s.Conn.Encode(redis.NewErrorf("ERR router is not online"), true)
s.CloseWithError(ErrRouterNotOnline)
s.incrOpFails(nil, nil)
s.flushOpStats(true)
}()
decrSessions()
return
}
tasks := NewRequestChanBuffer(1024)
go func() {
s.loopWriter(tasks)
decrSessions()
}()
go func() {
s.loopReader(tasks, d)
tasks.Close()
}()
})
}
func (s *Session) loopReader(tasks *RequestChan, d *Router) (err error) {
defer func() {
s.CloseReaderWithError(err)
}()
var (
breakOnFailure = s.config.SessionBreakOnFailure
maxPipelineLen = s.config.SessionMaxPipeline
)
for !s.quit {
multi, err := s.Conn.DecodeMultiBulk()
if err != nil {
return err
}
if len(multi) == 0 {
continue
}
s.incrOpTotal()
if tasks.Buffered() > maxPipelineLen {
return s.incrOpFails(nil, ErrTooManyPipelinedRequests)
}
start := time.Now()
s.LastOpUnix = start.Unix()
s.Ops++
r := &Request{}
r.Multi = multi
r.Batch = &sync.WaitGroup{}
r.Database = s.database
r.UnixNano = start.UnixNano()
if err := s.handleRequest(r, d); err != nil {
r.Resp = redis.NewErrorf("ERR handle request, %s", err)
tasks.PushBack(r)
if breakOnFailure {
return err
}
} else {
tasks.PushBack(r)
}
}
return nil
}
func (s *Session) handleRequest(r *Request, d *Router) error {
opstr, flag, err := getOpInfo(r.Multi)
if err != nil {
return err
}
r.OpStr = opstr
r.OpFlag = flag
r.Broken = &s.broken
if flag.IsNotAllowed() {
return fmt.Errorf("command '%s' is not allowed", opstr)
}
switch opstr {
case "QUIT":
return s.handleQuit(r)
case "AUTH":
return s.handleAuth(r)
}
if !s.authorized {
if s.config.SessionAuth != "" {
r.Resp = redis.NewErrorf("NOAUTH Authentication required")
return nil
}
s.authorized = true
}
switch opstr {
case "SELECT":
return s.handleSelect(r)
case "PING":
return s.handleRequestPing(r, d)
case "INFO":
return s.handleRequestInfo(r, d)
case "MGET":
return s.handleRequestMGet(r, d)
case "MSET":
return s.handleRequestMSet(r, d)
case "DEL":
return s.handleRequestDel(r, d)
case "EXISTS":
return s.handleRequestExists(r, d)
case "SLOTSINFO":
return s.handleRequestSlotsInfo(r, d)
case "SLOTSSCAN":
return s.handleRequestSlotsScan(r, d)
case "SLOTSMAPPING":
return s.handleRequestSlotsMapping(r, d)
default:
return d.dispatch(r)
}
}
func (s *Router) dispatch(r *Request) error {
hkey := getHashKey(r.Multi, r.OpStr)
var id = Hash(hkey) % MaxSlotNum
slot := &s.slots[id]
return slot.forward(r, hkey)
}
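dispatch picks a slot with hash(key) % MaxSlotNum. In Codis the hash is a CRC32 checksum of the key (after hash-tag extraction) and MaxSlotNum is 1024; both values are assumptions worth checking against the Codis version at hand. A standalone sketch of that mapping, with the hash-tag handling omitted:

package main

import (
	"fmt"
	"hash/crc32"
)

const maxSlotNum = 1024 // assumed slot count, matching Codis defaults

// slotOf reproduces the shape of Router.dispatch: hash the key and take
// the remainder to pick a slot.
func slotOf(key []byte) uint32 {
	return crc32.ChecksumIEEE(key) % maxSlotNum
}

func main() {
	for _, k := range []string{"foo", "bar", "user:1000"} {
		fmt.Printf("%-10s -> slot %d\n", k, slotOf([]byte(k)))
	}
}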
func (d *forwardSync) Forward(s *Slot, r *Request, hkey []byte) error {
s.lock.RLock()
bc, err := d.process(s, r, hkey)
s.lock.RUnlock()
if err != nil {
return err
}
bc.PushBack(r)
return nil
}
func (s *Session) loopWriter(tasks *RequestChan) (err error) {
defer func() {
s.CloseWithError(err)
tasks.PopFrontAllVoid(func(r *Request) {
s.incrOpFails(r, nil)
})
s.flushOpStats(true)
}()
var (
breakOnFailure = s.config.SessionBreakOnFailure
maxPipelineLen = s.config.SessionMaxPipeline
)
p := s.Conn.FlushEncoder()
p.MaxInterval = time.Millisecond
p.MaxBuffered = maxPipelineLen / 2
return tasks.PopFrontAll(func(r *Request) error {
resp, err := s.handleResponse(r)
if err != nil {
resp = redis.NewErrorf("ERR handle response, %s", err)
if breakOnFailure {
s.Conn.Encode(resp, true)
return s.incrOpFails(r, err)
}
}
if err := p.Encode(resp); err != nil {
return s.incrOpFails(r, err)
}
fflush := tasks.IsEmpty()
if err := p.Flush(fflush); err != nil {
return s.incrOpFails(r, err)
} else {
s.incrOpStats(r, resp.Type)
}
if fflush {
s.flushOpStats(false)
}
return nil
})
}
Transactions