Epoll and socket wrappers in Redis

How Redis uses epoll

epoll is Linux's I/O multiplexing mechanism (a small family of system calls). The way epoll itself is used is pretty much fixed; what varies is how each event loop (EventLoop) is organized under different reactor models.

epoll

Compared with poll/select, epoll is more efficient, mainly because (a minimal usage sketch follows the list):

  • It avoids copying the whole file-descriptor set between user space and kernel space on every call
  • It avoids scanning every monitored descriptor just to find the few that are readable or writable
  • Its I/O performance does not degrade as the number of monitored file descriptors grows
  • The monitored fds and their associated event data are kept in a red-black tree, which gives good insert/lookup/delete performance and, unlike a hash table, does not require pre-allocating a lot of space up front
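
To make this concrete, here is a minimal generic epoll read loop (plain Linux C, not Redis code; error handling kept short). It shows the three calls every epoll user makes: create the instance, register interest in an fd, then wait and walk only the ready events.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>

#define MAX_EVENTS 64

/* Minimal epoll loop over an already listening, non-blocking socket.
 * Real code would accept() clients / read requests where the inner comment sits. */
void event_loop(int listen_fd) {
    int epfd = epoll_create1(0);            /* the old size argument is obsolete, see below */
    if (epfd == -1) { perror("epoll_create1"); exit(1); }

    struct epoll_event ev = {0};
    struct epoll_event ready[MAX_EVENTS];
    ev.events = EPOLLIN;                    /* we only care about readability here */
    ev.data.fd = listen_fd;
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) == -1) {
        perror("epoll_ctl"); exit(1);
    }

    for (;;) {
        /* Only fds that are actually ready come back: no scan of the full set. */
        int n = epoll_wait(epfd, ready, MAX_EVENTS, -1);
        if (n == -1) { if (errno == EINTR) continue; perror("epoll_wait"); exit(1); }
        for (int i = 0; i < n; i++) {
            if (ready[i].data.fd == listen_fd && (ready[i].events & EPOLLIN)) {
                /* accept() new clients and EPOLL_CTL_ADD them here */
            }
        }
    }
}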

Many blog posts claim that epoll uses mmap to share memory between user space and kernel space so that ready events need not be copied. That claim looks dubious (the mainline epoll implementation does not appear to mmap anything into user space); see the discussion here:

epoll实现中共享内存问题? ("Shared memory in the epoll implementation?") - 知乎 (zhihu.com)

Redis's wrapper around epoll
  • The core data structure

    typedef struct aeApiState {
        int epfd;						// the epoll instance's file descriptor
        struct epoll_event *events;		 // buffer for the ready events returned by epoll_wait
    } aeApiState;
    
  • Allocation and lifecycle interface

    • static int aeApiCreate(aeEventLoop *eventLoop)
    • static int aeApiResize(aeEventLoop *eventLoop, int setsize)
    • static void aeApiFree(aeEventLoop *eventLoop)
    static int aeApiCreate(aeEventLoop *eventLoop) {
        aeApiState *state = zmalloc(sizeof(aeApiState));
    
        if (!state) return -1;
        state->events = zmalloc(sizeof(struct epoll_event)*eventLoop->setsize);
        if (!state->events) {
            zfree(state);
            return -1;
        }
        // Note the argument passed to epoll_create: the kernel treats it only as a hint
        // (and ignores it entirely on modern kernels); a single epoll instance can watch far
        // more fds than this -- people have demonstrated one instance handling around a million connections
        state->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
        if (state->epfd == -1) {
            zfree(state->events);
            zfree(state);
            return -1;
        }
        anetCloexec(state->epfd);
        eventLoop->apidata = state;
        return 0;
    }
    
    static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
        aeApiState *state = eventLoop->apidata;
    
        state->events = zrealloc(state->events, sizeof(struct epoll_event)*setsize);
        return 0;
    }
    
    static void aeApiFree(aeEventLoop *eventLoop) {
        aeApiState *state = eventLoop->apidata;
    
        close(state->epfd);
        zfree(state->events);
        zfree(state);
    }
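
    aeApiCreate above also calls anetCloexec(state->epfd) to mark the epoll fd close-on-exec. That helper lives in anet.c and is not quoted here; below is a minimal sketch of the usual pattern such a function follows (set_cloexec is my name, and the real anetCloexec may differ in details such as error reporting):

    #include <fcntl.h>

    /* Sketch only: mark an fd close-on-exec without disturbing its other FD flags. */
    static int set_cloexec(int fd) {
        int flags = fcntl(fd, F_GETFD);
        if (flags == -1) return -1;
        if (flags & FD_CLOEXEC) return 0;        /* already set, nothing to do */
        return fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
    }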
    
  • Add, modify and delete operations

    • static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask)

      static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
          aeApiState *state = eventLoop->apidata;
          struct epoll_event ee = {0}; /* avoid valgrind warning */
          /* If the fd was already monitored for some event, we need a MOD
           * operation. Otherwise we need an ADD operation. */
          int op = eventLoop->events[fd].mask == AE_NONE ?
                  EPOLL_CTL_ADD : EPOLL_CTL_MOD;
      
          ee.events = 0;
          mask |= eventLoop->events[fd].mask; /* Merge old events */
          if (mask & AE_READABLE) ee.events |= EPOLLIN;
          if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
          ee.data.fd = fd;
          if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
          return 0;
      }
      
    • static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask)

      static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
          aeApiState *state = eventLoop->apidata;
          struct epoll_event ee = {0}; /* avoid valgrind warning */
          int mask = eventLoop->events[fd].mask & (~delmask);
      
          ee.events = 0;
          if (mask & AE_READABLE) ee.events |= EPOLLIN;
          if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
          ee.data.fd = fd;
          if (mask != AE_NONE) {
              epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee);
          } else {
              /* Note, Kernel < 2.6.9 requires a non null event pointer even for
               * EPOLL_CTL_DEL. */
              epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee);
          }
      }
      
    • static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp)

      This is the wrapper around epoll_wait. Note the timeout conversion from struct timeval to milliseconds: (tvp->tv_usec + 999)/1000 rounds the microseconds up, so a non-zero sub-millisecond remainder still produces at least a 1 ms timeout instead of a 0 ms timeout that would make the loop spin.

      static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
          aeApiState *state = eventLoop->apidata;
          int retval, numevents = 0;
      
          retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
                  tvp ? (tvp->tv_sec*1000 + (tvp->tv_usec + 999)/1000) : -1);
          if (retval > 0) {
              int j;
      
              numevents = retval;
              for (j = 0; j < numevents; j++) {
                  int mask = 0;
                  struct epoll_event *e = state->events+j;
      
                  if (e->events & EPOLLIN) mask |= AE_READABLE;
                  if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
                  if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE;
                  if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE;
                  eventLoop->fired[j].fd = e->data.fd;
                  eventLoop->fired[j].mask = mask;
              }
          } else if (retval == -1 && errno != EINTR) {
              panic("aeApiPoll: epoll_wait, %s", strerror(errno));
          }
      
          return numevents;
      }
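
      For context, here is a simplified sketch of how the event-loop layer can consume the fired array that aeApiPoll fills in. This is not the real aeProcessEvents (which also handles time events, barriers, and more); it only assumes the aeFileEvent fields from ae.h (mask, rfileProc, wfileProc, clientData), and processFiredEvents is my name:

      /* Simplified dispatch over the fired events; a sketch, not Redis's aeProcessEvents. */
      static void processFiredEvents(aeEventLoop *eventLoop, struct timeval *tvp) {
          int numevents = aeApiPoll(eventLoop, tvp);
          for (int j = 0; j < numevents; j++) {
              int fd = eventLoop->fired[j].fd;
              int mask = eventLoop->fired[j].mask;
              aeFileEvent *fe = &eventLoop->events[fd];

              /* only call a handler if it is registered for an event that actually fired */
              if ((fe->mask & mask & AE_READABLE) && fe->rfileProc)
                  fe->rfileProc(eventLoop, fd, fe->clientData, mask);
              if ((fe->mask & mask & AE_WRITABLE) && fe->wfileProc)
                  fe->wfileProc(eventLoop, fd, fe->clientData, mask);
          }
      }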
      

That part was fairly light, so while we're at it, let's look at how Redis wraps sockets.

Redis's socket wrapper

From the introductory comment of the connections module (quoted in full below): a connection does not provide high-level input/output buffering; those higher-level features live in networking.c. The main purpose of the connection abstraction is to handle TCP and TLS (the successor to SSL) transparently. Two properties follow from that:

  1. A connection may live before its corresponding socket exists. This allows various configuration and context to be set up before the actual connection is established.
  2. The caller may register logical read/write handlers to be invoked when the connection has data to read or can accept writes. Depending on the implementation, these logical handlers may or may not correspond one-to-one to actual AE events.

(The original source comment:)

/* The connections module provides a lean abstraction of network connections
 * to avoid direct socket and async event management across the Redis code base.
 *
 * It does NOT provide advanced connection features commonly found in similar
 * libraries such as complete in/out buffer management, throttling, etc. These
 * functions remain in networking.c.
 *
 * The primary goal is to allow transparent handling of TCP and TLS based
 * connections. To do so, connections have the following properties:
 *
 * 1. A connection may live before its corresponding socket exists.  This
 *    allows various context and configuration setting to be handled before
 *    establishing the actual connection.
 * 2. The caller may register/unregister logical read/write handlers to be
 *    called when the connection has data to read from/can accept writes.
 *    These logical handlers may or may not correspond to actual AE events,
 *    depending on the implementation (for TCP they are; for TLS they aren't).
 */

The interface exposed in socket.c mainly sets socket options (setsockopt) on the connection's underlying socket fd:

int connBlock(connection *conn)
int connNonBlock(connection *conn);
int connEnableTcpNoDelay(connection *conn);
int connDisableTcpNoDelay(connection *conn);
int connKeepAlive(connection *conn, int interval);
int connSendTimeout(connection *conn, long long ms);
int connRecvTimeout(connection *conn, long long ms);
int RedisRegisterConnectionTypeSocket(void);
  • int connBlock(connection *conn) / int connNonBlock(connection *conn)

    /* socket.c */
    int connBlock(connection *conn) {
        if (conn->fd == -1) return C_ERR;
        return anetBlock(NULL, conn->fd);
    }
    int connNonBlock(connection *conn) {
        if (conn->fd == -1) return C_ERR;
        return anetNonBlock(NULL, conn->fd);
    }
    /* anet.c */
    int anetBlock(char *err, int fd) {
        return anetSetBlock(err,fd,0);
    }
    int anetNonBlock(char *err, int fd) {
        return anetSetBlock(err,fd,1);
    }
    
    int anetSetBlock(char *err, int fd, int non_block) {
        int flags;
    
        /* Set the socket blocking (if non_block is zero) or non-blocking.
         * Note that fcntl(2) for F_GETFL and F_SETFL can't be
         * interrupted by a signal. */
        // fetch the fd's current flags via fcntl
        if ((flags = fcntl(fd, F_GETFL)) == -1) {
            anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno));
            return ANET_ERR;
        }
    
        /* Check if this flag has been set or unset, if so, 
         * then there is no need to call fcntl to set/unset it again. */
        // the double negation (!!) is the usual idiom for normalizing a value to 0 or 1
        if (!!(flags & O_NONBLOCK) == !!non_block)
            return ANET_OK;

        if (non_block)		// make the fd non-blocking
            // detail: use |= to set the bit so the other flag bits in `flags`
            // are preserved instead of being wiped out
            flags |= O_NONBLOCK;
        else				// make the fd blocking again
            // why &= here? ~O_NONBLOCK turns e.g. 000001 into 111110,
            // so ANDing clears only the O_NONBLOCK bit and leaves the rest untouched
            flags &= ~O_NONBLOCK;
        // write the updated flags back
        if (fcntl(fd, F_SETFL, flags) == -1) {
            anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s", strerror(errno));
            return ANET_ERR;
        }
        return ANET_OK;
    }
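
    A tiny standalone illustration of the !! idiom above (my example, not Redis code): flags & O_NONBLOCK yields either 0 or O_NONBLOCK's platform-specific bit value, while the caller's non_block may be any non-zero integer, so comparing the raw values would be wrong; !! collapses both sides to exactly 0 or 1 first.

    #include <fcntl.h>
    #include <stdio.h>

    int main(void) {
        int flags = O_NONBLOCK | O_APPEND;   /* a flag word with O_NONBLOCK set */
        int non_block = 3;                   /* caller passed an arbitrary non-zero value */

        /* raw comparison: O_NONBLOCK's bit value is a power of two, never 3, so this prints 0 */
        printf("raw:  %d\n",   (flags & O_NONBLOCK) == non_block);
        /* normalized comparison asks "are both truthy?", so this prints 1 */
        printf("norm: %d\n", !!(flags & O_NONBLOCK) == !!non_block);
        return 0;
    }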
    
  • int connEnableTcpNoDelay(connection *conn) / int connDisableTcpNoDelay(connection *conn)

    Enable or disable TCP's Nagle algorithm via TCP_NODELAY.

    /* socket.c */
    int connEnableTcpNoDelay(connection *conn) {
        if (conn->fd == -1) return C_ERR;
        return anetEnableTcpNoDelay(NULL, conn->fd);
    }
    int connDisableTcpNoDelay(connection *conn) {
        if (conn->fd == -1) return C_ERR;
        return anetDisableTcpNoDelay(NULL, conn->fd);
    }
    
    /* anet.c */
    int anetEnableTcpNoDelay(char *err, int fd)
    {
        return anetSetTcpNoDelay(err, fd, 1);
    }
    int anetDisableTcpNoDelay(char *err, int fd)
    {
        return anetSetTcpNoDelay(err, fd, 0);
    }
    static int anetSetTcpNoDelay(char *err, int fd, int val)
    {
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) == -1)
        {
            anetSetError(err, "setsockopt TCP_NODELAY: %s", strerror(errno));
            return ANET_ERR;
        }
        return ANET_OK;
    }
    
    

    The implementation is straightforward. A quick digression on TCP's Nagle algorithm (UNIX Network Programming, Volume 1, p. 172):

    TCP's Nagle algorithm

    The Nagle algorithm aims to reduce the number of small packets on a WAN. It states that if a given connection has outstanding data (data sent but not yet acknowledged), small packets must not be sent until that outstanding data has been acknowledged; in effect, a connection may have at most one unacknowledged small packet in flight at any time.

    Nagle's algorithm often interacts with another TCP mechanism, the delayed ACK algorithm, under which TCP does not ACK received data immediately but waits a short while hoping to piggyback the ACK on outgoing data. For a request built from several small writes, the two combine badly: the sender withholds the next small segment until it gets an ACK, while the receiver delays exactly that ACK, adding avoidable latency to every round trip. That is why a latency-sensitive request/response server like Redis typically turns the algorithm off via TCP_NODELAY.

  • int connKeepAlive(connection *conn, int interval)

    This mainly concerns the SO_KEEPALIVE socket option.

    Once the keep-alive option is set on a TCP socket, if no data has been exchanged in either direction for two hours, TCP automatically sends a keep-alive probe to the peer. This is a TCP segment the peer must respond to, and one of three things happens:

    • The peer responds with the expected ACK.
    • The peer responds with an RST, telling the local TCP that the peer has crashed and rebooted. The socket's pending error is set to ECONNRESET and the socket is closed.
    • The peer does not respond at all. Berkeley-derived TCPs then send another 8 probes, 75 seconds apart, trying to elicit a response; if none arrives within 11 minutes and 15 seconds of the first probe, TCP gives up.
    /* socket.c */
    int connKeepAlive(connection *conn, int interval) {
        if (conn->fd == -1) return C_ERR;
        return anetKeepAlive(NULL, conn->fd, interval);
    }
    /* anet.c */
    /* Set TCP keep alive option to detect dead peers. The interval option
     * is only used for Linux as we are using Linux-specific APIs to set
     * the probe send time, interval, and count. */
    int anetKeepAlive(char *err, int fd, int interval)
    {
        int val = 1;
        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) == -1)
        {
            anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno));
            return ANET_ERR;
        }
    
    #ifdef __linux__
        /* Default settings are more or less garbage, with the keepalive time
         * set to 7200 by default on Linux. Modify settings to make the feature
         * actually useful. */
    
        /* Send first probe after interval. */
        val = interval;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) {
            anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno));
            return ANET_ERR;
        }
    
        /* Send next probes after the specified interval. Note that we set the
         * delay as interval / 3, as we send three probes before detecting
         * an error (see the next setsockopt call). */
        val = interval/3;
        if (val == 0) val = 1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) {
            anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
            return ANET_ERR;
        }
    
        /* Consider the socket in error state after three we send three ACK
         * probes without getting a reply. */
        val = 3;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) {
            anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
            return ANET_ERR;
        }
            return ANET_OK;
    }
    
  • int connSendTimeout(connection *conn, long long ms)

    int connRecvTimeout(connection *conn, long long ms)

    /* socket.c */
    int connSendTimeout(connection *conn, long long ms) {
        return anetSendTimeout(NULL, conn->fd, ms);
    }
    
    int connRecvTimeout(connection *conn, long long ms) {
        return anetRecvTimeout(NULL, conn->fd, ms);
    }
    /* anet.c */
    /* Set the socket send timeout (SO_SNDTIMEO socket option) to the specified
     * number of milliseconds, or disable it if the 'ms' argument is zero. */
    int anetSendTimeout(char *err, int fd, long long ms) {
        struct timeval tv;
    
        tv.tv_sec = ms/1000;
        tv.tv_usec = (ms%1000)*1000;
        if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1) {
            anetSetError(err, "setsockopt SO_SNDTIMEO: %s", strerror(errno));
            return ANET_ERR;
        }
        return ANET_OK;
    }
    
    /* Set the socket receive timeout (SO_RCVTIMEO socket option) to the specified
     * number of milliseconds, or disable it if the 'ms' argument is zero. */
    int anetRecvTimeout(char *err, int fd, long long ms) {
        struct timeval tv;
    
        tv.tv_sec = ms/1000;
        tv.tv_usec = (ms%1000)*1000;
        if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
            anetSetError(err, "setsockopt SO_RCVTIMEO: %s", strerror(errno));
            return ANET_ERR;
        }
        return ANET_OK;
    }
    
  • int RedisRegisterConnectionTypeSocket(void): this one is a bit special; it registers the file-scope global CT_Socket in the global connection-type table

    /* socket.c */
    static ConnectionType CT_Socket;
    int RedisRegisterConnectionTypeSocket(void)
    {
        return connTypeRegister(&CT_Socket);
    }
    /* connection.c */
    static ConnectionType *connTypes[CONN_TYPE_MAX];
    int connTypeRegister(ConnectionType *ct) {
        const char *typename = ct->get_type(NULL);
        ConnectionType *tmpct;
        int type;
    
        /* find an empty slot to store the new connection type */
        for (type = 0; type < CONN_TYPE_MAX; type++) {
            tmpct = connTypes[type];
            if (!tmpct)
                break;
    
            /* ignore case, we really don't care "tls"/"TLS" */
            if (!strcasecmp(typename, tmpct->get_type(NULL))) {
                serverLog(LL_WARNING, "Connection types %s already registered", typename);
                return C_ERR;
            }
        }
    
        serverLog(LL_VERBOSE, "Connection type %s registered", typename);
        connTypes[type] = ct;
        // call the connection type's own init hook, if it defines one
        if (ct->init) {
            ct->init();
        }
    
        return C_OK;
    }
    
The ConnectionType structure

socket.c defines a file-scope static variable, the static ConnectionType CT_Socket mentioned above.

  • First, a look at CT_Socket itself

    /* The definition of CT_Socket in socket.c; every field is a function pointer. */
    /* Reading the source, the functions these pointers refer to are all implemented in socket.c */
    /* above this initializer and are bound to the ConnectionType as callbacks. */
    /* Some are simple enough to describe inline here; the more involved ones are covered separately below. */
    static ConnectionType CT_Socket = {
        /* connection type */
        .get_type = connSocketGetType,	
    
        /* connection type initialize & finalize & configure */
        .init = NULL,		
        .cleanup = NULL,
        .configure = NULL,
    
        /* ae & accept & listen & error & address handler */
        .ae_handler = connSocketEventHandler,
        .accept_handler = connSocketAcceptHandler,
        .addr = connSocketAddr,		// simple formatting helper: turns the socket's IP and port into a string for output
        .is_local = connSocketIsLocal,	// checks whether the socket is local: calls connSocketAddr above
        							  // and then tests whether the address starts with "127"
        .listen = connSocketListen,		
    
        /* create/shutdown/close connection */
        .conn_create = connCreateSocket,	// creates an empty connection object with no socket fd attached yet
        // conn_create_accepted calls connCreateSocket internally and attaches an already-accepted socket fd
        .conn_create_accepted = connCreateAcceptedSocket,	
        .shutdown = connSocketShutdown,	// internally calls shutdown(conn->fd, SHUT_RDWR) to shut the connection down
        .close = connSocketClose,
    
        /* connect & accept */
        .connect = connSocketConnect,
        .blocking_connect = connSocketBlockingConnect,	// blocking flavour of connect: a non-blocking connect
        											// followed by waiting (with a timeout) until it completes
        .accept = connSocketAccept,	// moves the connection's state to CONNECTED and invokes the registered handler
    
        /* IO */
        .write = connSocketWrite,
        .writev = connSocketWritev,
        .read = connSocketRead,
        .set_write_handler = connSocketSetWriteHandler,
        .set_read_handler = connSocketSetReadHandler,
        .get_last_error = connSocketGetLastError,
        .sync_write = connSocketSyncWrite,
        .sync_read = connSocketSyncRead,
        .sync_readline = connSocketSyncReadLine,
    
        /* pending data */
        .has_pending_data = NULL,
        .process_pending_data = NULL,
    };
    

Now a look at the connection states:

typedef enum {
    CONN_STATE_NONE = 0,		// no connection yet; as the header comment says, a connection object
    						  // may be instantiated before its corresponding socket exists
    CONN_STATE_CONNECTING,		// an outgoing connect is in progress
    CONN_STATE_ACCEPTING,		// an incoming connection has been accepted at the socket level and is
    						  // still being set up before being handed over as fully established
    CONN_STATE_CONNECTED,		// established, two-way communication is possible
    CONN_STATE_CLOSED,			// the connection has been closed
    CONN_STATE_ERROR			// the connection is in an error state
} ConnectionState;
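
Roughly, my reading of the state transitions in socket.c (not an official diagram):

    outgoing (.connect):              NONE -> CONNECTING -> CONNECTED (or ERROR) -> CLOSED
    accepted (.conn_create_accepted): ACCEPTING ---------> CONNECTED (or ERROR) -> CLOSED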

Now let's walk through the concrete implementations of the callbacks bound in CT_Socket.

connection type
  • .get_type = connSocketGetType: returns this connection's concrete type; since this backend handles plain TCP (TLS has its own ConnectionType), it returns "tcp"

    static const char *connSocketGetType(connection *conn) {
        (void) conn;
        return CONN_TYPE_SOCKET;	/* tcp */
    }
    
ae & accept & listen & error & address handler
  • .ae_handler = connSocketEventHandler: the main job of this function is to map AE read/write events on the fd to the connection's registered logical read/write handlers

    static void connSocketEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask)
    {
        /* UNUSED is harmless; it just expands to ((void) x) to silence unused-parameter warnings */
        UNUSED(el);
        UNUSED(fd);
        connection *conn = clientData;	// the connection itself was registered as the AE event's clientData
        /* If a non-blocking connect was in progress, check how it ended and move the
         * connection's state from CONNECTING to CONNECTED (or ERROR), then fire conn_handler. */
        if (conn->state == CONN_STATE_CONNECTING &&
                (mask & AE_WRITABLE) && conn->conn_handler) {
            int conn_error = anetGetError(conn->fd);
            if (conn_error) {
                conn->last_errno = conn_error;
                conn->state = CONN_STATE_ERROR;
            } else {
                conn->state = CONN_STATE_CONNECTED;
            }
            if (!conn->write_handler) aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE);
            if (!callHandler(conn, conn->conn_handler)) return;
            conn->conn_handler = NULL;
        }
    
        // Normally the read handler on a connection fires before the write handler,
        // but when CONN_FLAG_WRITE_BARRIER is set the order is inverted: the write
        // handler runs first and the read handler only afterwards
        int invert = conn->flags & CONN_FLAG_WRITE_BARRIER;
    	
        int call_write = (mask & AE_WRITABLE) && conn->write_handler;
        int call_read = (mask & AE_READABLE) && conn->read_handler;
    	
        /* Handle normal I/O flows */
        // dispatch the fired read/write events to this connection's registered handlers
        if (!invert && call_read) {
            if (!callHandler(conn, conn->read_handler)) return;
        }
        /* Fire the writable event. */
        if (call_write) {
            if (!callHandler(conn, conn->write_handler)) return;
        }
        /* If we have to invert the call, fire the readable event now
         * after the writable one. */
        if (invert && call_read) {
            if (!callHandler(conn, conn->read_handler)) return;
        }
    }
    
    /* connhelpers.h */
    static inline void connIncrRefs(connection *conn) {
        conn->refs++;
    }
    static inline void connDecrRefs(connection *conn) {
        conn->refs--;
    }
    static inline int connHasRefs(connection *conn) {
        return conn->refs;
    }
    /* Helper for connection implementations to call handlers:
     * 1. Increment refs to protect the connection.
     * 2. Execute the handler (if set).
     * 3. Decrement refs and perform deferred close, if refs==0.
     */
    static inline int callHandler(connection *conn, ConnectionCallbackFunc handler) {
        connIncrRefs(conn);
        if (handler) handler(conn);
        connDecrRefs(conn);
        if (conn->flags & CONN_FLAG_CLOSE_SCHEDULED) {
            // if nothing still holds a reference to the connection, perform the deferred close
            if (!connHasRefs(conn)) connClose(conn);
            return 0;
        }
        return 1;
    }
    
  • .accept_handler = connSocketAcceptHandler,

    The core call here is accept().

    static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
        int cport, cfd, max = MAX_ACCEPTS_PER_CALL;
        char cip[NET_IP_STR_LEN];
        UNUSED(el);
        UNUSED(mask);
        UNUSED(privdata);
    
        while(max--) {
            cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
            if (cfd == ANET_ERR) {
            if (errno != EWOULDBLOCK) /* EWOULDBLOCK: nothing left to accept on the non-blocking listening fd */
                    serverLog(LL_WARNING,
                        "Accepting client connection: %s", server.neterr);
                return;
            }
            serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport);
            /* connCreateAcceptedSocket() wraps the newly accepted socket fd in a connection object */
            /* acceptCommonHandler is fairly long, so it is not quoted here; roughly, it does the */
            // following for the freshly accepted fd's connection:
            // 1. allocates a client for the connection and checks/updates the connection's state
            // 2. checks whether the server has already exceeded its connection limit
            // 3. initializes the client state associated with this connection
            // 4. sets the client's flags -> c->flags |= flags;
            // 5. invokes a post-accept callback (clientAcceptHandler) for whatever must happen right after accept
            acceptCommonHandler(connCreateAcceptedSocket(cfd, NULL),0,cip);
        }
    }
    
    /* anet.c */
    /* Accept a connection and also make sure the socket is non-blocking, and CLOEXEC.
     * returns the new socket FD, or -1 on error. */
    int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port) {
        int fd;
        struct sockaddr_storage sa;
        socklen_t salen = sizeof(sa);
        if ((fd = anetGenericAccept(err,serversock,(struct sockaddr*)&sa,&salen)) == ANET_ERR)
            return ANET_ERR;
    
        if (sa.ss_family == AF_INET) {		// AF_INET means the peer address is IPv4
            struct sockaddr_in *s = (struct sockaddr_in *)&sa;
            if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len);
            if (port) *port = ntohs(s->sin_port); /* ntohs converts network byte order to host byte order (one of the four hton/ntoh helpers) */
        } else {							// otherwise the address family is IPv6
            struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
            if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len);
            if (port) *port = ntohs(s->sin6_port);
        }
        return fd;
    }
    
    static int anetGenericAccept(char *err, int s, struct sockaddr *sa, socklen_t *len) {
        int fd;
        do {
            /* Use the accept4() call on linux to simultaneously accept and
             * set a socket as non-blocking. */
            fd = accept(s,sa,len);
        } while(fd == -1 && errno == EINTR);
        if (fd == -1) {			// accept failed (after retrying on EINTR)
            anetSetError(err, "accept: %s", strerror(errno));
            return ANET_ERR;
        }
        if (anetCloexec(fd) == -1) {		// mark the new fd close-on-exec (FD_CLOEXEC)
            anetSetError(err, "anetCloexec: %s", strerror(errno));
            close(fd);
            return ANET_ERR;
        }
        if (anetNonBlock(err, fd) != ANET_OK) {		// make the accepted socket non-blocking
            close(fd);
            return ANET_ERR;
        }
        return fd;
    }
    
  • .listen = connSocketListen,

    /* socket.c */
    // A server listens on a port through a dedicated listening fd; the port is an OS-level
    // resource, so unrelated processes cannot each simply listen on the same port.
    // As I recall, in nginx the master process creates the listening fd and then fork()s
    // worker processes that inherit it, and each worker calls accept() on that shared fd.
    // With several processes blocked on the same listening fd this can trigger the
    // thundering-herd problem, which nginx mitigates (e.g. with its accept mutex).
    static int connSocketListen(connListener *listener) {
        return listenToPort(listener);
    }
    /* server.c */
    struct redisServer {
        /* network */ 	// an array of listeners, one slot per connection type
        connListener listeners[CONN_TYPE_MAX]; /* TCP/Unix/TLS even more types */
    };
    /* server.c */
    // listenToPort creates one listening fd per configured bind address (IPv4 and/or IPv6),
    // all bound to the same port, and stores them in the connListener.
    // The calls themselves are conventional socket setup, so just read through them.
    int listenToPort(connListener *sfd) {
        int j;
        int port = sfd->port;
        char **bindaddr = sfd->bindaddr;
    
        /* If we have no bind address, we don't listen on a TCP socket */
        if (sfd->bindaddr_count == 0) return C_OK;
    
        for (j = 0; j < sfd->bindaddr_count; j++) {
            char* addr = bindaddr[j];
            int optional = *addr == '-';
            if (optional) addr++;
            if (strchr(addr,':')) {
                /* Bind IPv6 address. */
                sfd->fd[sfd->count] = anetTcp6Server(server.neterr,port,addr,server.tcp_backlog);
            } else {
                /* Bind IPv4 address. */
                sfd->fd[sfd->count] = anetTcpServer(server.neterr,port,addr,server.tcp_backlog);
            }
            if (sfd->fd[sfd->count] == ANET_ERR) {
                int net_errno = errno;
                serverLog(LL_WARNING,
                    "Warning: Could not create server TCP listening socket %s:%d: %s",
                    addr, port, server.neterr);
                if (net_errno == EADDRNOTAVAIL && optional)
                    continue;
                if (net_errno == ENOPROTOOPT     || net_errno == EPROTONOSUPPORT ||
                    net_errno == ESOCKTNOSUPPORT || net_errno == EPFNOSUPPORT ||
                    net_errno == EAFNOSUPPORT)
                    continue;
    
                /* Rollback successful listens before exiting */
                closeListener(sfd);
                return C_ERR;
            }
            if (server.socket_mark_id > 0) anetSetSockMarkId(NULL, sfd->fd[sfd->count], server.socket_mark_id);
            anetNonBlock(NULL,sfd->fd[sfd->count]);
            anetCloexec(sfd->fd[sfd->count]);
            sfd->count++;
        }
        return C_OK;
    }
    
    /* anet.c */ // -> wraps the socket/bind/listen setup sequence
    int anetTcpServer(char *err, int port, char *bindaddr, int backlog)
    {
        return _anetTcpServer(err, port, bindaddr, AF_INET, backlog);
    }
    int anetTcp6Server(char *err, int port, char *bindaddr, int backlog)
    {
        return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog);
    }
    
    static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog)
    {
        int s = -1, rv;
        char _port[6];  /* strlen("65535") */
        struct addrinfo hints, *servinfo, *p;
    
        snprintf(_port,6,"%d",port);
        memset(&hints,0,sizeof(hints));
        hints.ai_family = af;
        hints.ai_socktype = SOCK_STREAM;
        hints.ai_flags = AI_PASSIVE;    /* No effect if bindaddr != NULL */
        if (bindaddr && !strcmp("*", bindaddr))
            bindaddr = NULL;
        if (af == AF_INET6 && bindaddr && !strcmp("::*", bindaddr))
            bindaddr = NULL;
    
        if ((rv = getaddrinfo(bindaddr,_port,&hints,&servinfo)) != 0) {
            anetSetError(err, "%s", gai_strerror(rv));
            return ANET_ERR;
        }
        for (p = servinfo; p != NULL; p = p->ai_next) {
            if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
                continue;
    
            if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error;
            // set SO_REUSEADDR so Redis can close and re-open its listening sockets frequently
            if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
            // call bind and listen
            if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog,0) == ANET_ERR) s = ANET_ERR;
            goto end;
        }
        if (p == NULL) {
            anetSetError(err, "unable to bind socket, errno: %d", errno);
            goto error;
        }
    error:
        if (s != -1) close(s);
        s = ANET_ERR;
    end:
        freeaddrinfo(servinfo);
        return s;
    }
    
    static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog, mode_t perm) {
        // bind the listening fd to the requested address and port
        if (bind(s,sa,len) == -1) {
            anetSetError(err, "bind: %s", strerror(errno));
            close(s);
            return ANET_ERR;
        }
    
        if (sa->sa_family == AF_LOCAL && perm)
            chmod(((struct sockaddr_un *) sa)->sun_path, perm);
        // listen() puts the socket into listening state; backlog bounds the accept queue,
        // i.e. connections that have completed the three-way handshake but have not yet
        // been returned to the application by accept()
        if (listen(s, backlog) == -1) {
            anetSetError(err, "listen: %s", strerror(errno));
            close(s);
            return ANET_ERR;
        }
        return ANET_OK;
    }
    

    Uses of SO_REUSEADDR

    This socket option can serve four different purposes (a minimal sketch of setting it follows the list):

    • It allows a listening server to start and bind its well-known port even if previously established connections that use that port as their local port still exist. This is the usual restart-a-server scenario.
    • It allows multiple instances of the same server to be started on the same port, as long as each instance binds a different local IP address. This is commonly used with IP aliasing, e.g. hosting several HTTP sites on one machine.
    • It allows a single process to bind the same port to multiple sockets, as long as each bind specifies a different local IP address.
    • It allows completely duplicate bindings.
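
    anetSetReuseAddr itself is not quoted above; below is a minimal sketch of the standard pattern it presumably follows (set_reuse_addr is my name; the real function reports failures through anetSetError rather than perror):

    #include <stdio.h>
    #include <sys/socket.h>

    /* Sketch only: enable SO_REUSEADDR on a socket fd. */
    static int set_reuse_addr(int fd) {
        int yes = 1;
        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) {
            perror("setsockopt SO_REUSEADDR");
            return -1;
        }
        return 0;
    }
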
  • .close = connSocketClose,

    /* Close the connection and free resources. */
    static void connSocketClose(connection *conn) {
        if (conn->fd != -1) {
            aeDeleteFileEvent(server.el,conn->fd, AE_READABLE | AE_WRITABLE);
            close(conn->fd);
            conn->fd = -1;
        }
        /* If called from within a handler, schedule the close but
         * keep the connection until the handler returns.
         */
        if (connHasRefs(conn)) {
            conn->flags |= CONN_FLAG_CLOSE_SCHEDULED;
            return;
        }
        zfree(conn);
    }
    
  • .connect = connSocketConnect,

    This is the wrapper around connect(); it is used where Redis acts as the client side of a connection (for example when a replica connects to its master).

    static int connSocketConnect(connection *conn, const char *addr, int port, const char *src_addr,
            ConnectionCallbackFunc connect_handler) {
        int fd = anetTcpNonBlockBestEffortBindConnect(NULL,addr,port,src_addr);
        if (fd == -1) {
            conn->state = CONN_STATE_ERROR;
            conn->last_errno = errno;
            return C_ERR;
        }
    
        conn->fd = fd;
        conn->state = CONN_STATE_CONNECTING;
    
        conn->conn_handler = connect_handler;
        // register an AE writable event so that completion of the non-blocking connect is picked up by ae_handler
        aeCreateFileEvent(server.el, conn->fd, AE_WRITABLE,
                conn->type->ae_handler, conn);
    
        return C_OK;
    }
    /* anet.c */
    int anetTcpNonBlockBestEffortBindConnect(char *err, const char *addr, int port,
                                             const char *source_addr)
    {
        return anetTcpGenericConnect(err,addr,port,source_addr,
                ANET_CONNECT_NONBLOCK|ANET_CONNECT_BE_BINDING);
    }
    static int anetTcpGenericConnect(char *err, const char *addr, int port,
                                     const char *source_addr, int flags)
    {
        int s = ANET_ERR, rv;
        char portstr[6];  /* strlen("65535") + 1; */
        struct addrinfo hints, *servinfo, *bservinfo, *p, *b;
    
        snprintf(portstr,sizeof(portstr),"%d",port);
        memset(&hints,0,sizeof(hints));
        hints.ai_family = AF_UNSPEC;
        hints.ai_socktype = SOCK_STREAM;
    
        if ((rv = getaddrinfo(addr,portstr,&hints,&servinfo)) != 0) {
            anetSetError(err, "%s", gai_strerror(rv));
            return ANET_ERR;
        }
        for (p = servinfo; p != NULL; p = p->ai_next) {
            /* Try to create the socket and to connect it.
             * If we fail in the socket() call, or on connect(), we retry with
             * the next entry in servinfo. */
            if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
                continue;
            if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
            if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK)
                goto error;
            if (source_addr) {
                int bound = 0;
                /* Using getaddrinfo saves us from self-determining IPv4 vs IPv6 */
                if ((rv = getaddrinfo(source_addr, NULL, &hints, &bservinfo)) != 0)
                {
                    anetSetError(err, "%s", gai_strerror(rv));
                    goto error;
                }
                for (b = bservinfo; b != NULL; b = b->ai_next) {
                    if (bind(s,b->ai_addr,b->ai_addrlen) != -1) {
                        bound = 1;
                        break;
                    }
                }
                freeaddrinfo(bservinfo);
                if (!bound) {
                    anetSetError(err, "bind: %s", strerror(errno));
                    goto error;
                }
            }
            if (connect(s,p->ai_addr,p->ai_addrlen) == -1) {
                /* If the socket is non-blocking, it is ok for connect() to
                 * return an EINPROGRESS error here. */
                if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK)
                    goto end;
                close(s);
                s = ANET_ERR;
                continue;
            }
    
            /* If we ended an iteration of the for loop without errors, we
             * have a connected socket. Let's return to the caller. */
            goto end;
        }
        if (p == NULL)
            anetSetError(err, "creating socket: %s", strerror(errno));
    
    error:
        if (s != ANET_ERR) {
            close(s);
            s = ANET_ERR;
        }
    
    end:
        freeaddrinfo(servinfo);
    
        /* Handle best effort binding: if a binding address was used, but it is
         * not possible to create a socket, try again without a binding address. */
        // Best-effort binding: if a source address was requested but connecting with it failed,
        // retry the whole thing without binding. Note that source_addr sets the *local* (source)
        // address of the outgoing connection, not a listening port; a plain client usually leaves
        // it NULL and lets the kernel pick an ephemeral port, but it matters on multi-homed hosts
        // where the caller wants to control which local interface the connection originates from.
        if (s == ANET_ERR && source_addr && (flags & ANET_CONNECT_BE_BINDING)) {
            return anetTcpGenericConnect(err,addr,port,NULL,flags);
        } else {
            return s;
        }
    }
    
    

**That essentially wraps up socket creation in Redis. One more Redis-specific detail is its connection reference-counting mechanism.** A later pass will tie everything together and look at how that reference counting is implemented.
