Overview
The main thread creates a number of child threads (known as worker threads). Every thread maintains its own event loop: each has its own epoll instance and calls epoll_wait to block waiting for events. Each worker thread is connected to the main thread by a pipe, and each worker monitors the read end of its own pipe. When the main thread wants to talk to a particular worker, it simply writes into the corresponding pipe.
How the model works: the main thread listens on the process's public TCP port. When a client connects, the main thread accepts the connection, picks one of the worker threads, and hands the client fd to it through that worker's pipe. From then on the worker is responsible for all communication with that client. What follows is the classic form of this model.
Concretely: the main thread watches the listening socket it created through its epoll. When a connection arrives it accepts it, wraps the new fd in a CQ_ITEM, picks a worker by round-robin, pushes the item onto that worker's connection queue, and then writes a byte into the worker's pipe; the worker in turn adds the fd to its own epoll. This is exactly the process described above.
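Before looking at memcached's implementation, here is a minimal standalone sketch of the pipe-notification pattern just described. This is my own illustration rather than memcached source: the worker struct, worker_loop, and NUM_WORKERS are invented names, and error handling is omitted for brevity.
/* Each worker owns an epoll instance and watches the read end of its pipe;
 * the main thread notifies a worker by writing one byte to the write end.
 * Compile with: cc -pthread sketch.c */
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/epoll.h>

#define NUM_WORKERS 4

struct worker {
    pthread_t tid;
    int notify_receive_fd;  /* read end of the pipe */
    int notify_send_fd;     /* write end of the pipe */
};

static void *worker_loop(void *arg) {
    struct worker *w = arg;
    int ep = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = w->notify_receive_fd };
    epoll_ctl(ep, EPOLL_CTL_ADD, w->notify_receive_fd, &ev);
    for (;;) {
        struct epoll_event out;
        if (epoll_wait(ep, &out, 1, -1) == 1) {
            char cmd;
            read(w->notify_receive_fd, &cmd, 1);  /* one byte per notification */
            printf("worker %lu got command '%c'\n",
                   (unsigned long)pthread_self(), cmd);
            /* a real worker would now pop a connection off its queue
             * and add that fd to its epoll */
        }
    }
    return NULL;
}

int main(void) {
    struct worker workers[NUM_WORKERS];
    for (int i = 0; i < NUM_WORKERS; i++) {
        int fds[2];
        pipe(fds);
        workers[i].notify_receive_fd = fds[0];
        workers[i].notify_send_fd = fds[1];
        pthread_create(&workers[i].tid, NULL, worker_loop, &workers[i]);
    }
    /* main thread: round-robin a few notifications, as a dispatcher would */
    for (int i = 0; i < 8; i++)
        write(workers[i % NUM_WORKERS].notify_send_fd, "c", 1);
    sleep(1);  /* give the workers time to print before the demo exits */
    return 0;
}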
How memcached implements this network model
The main thread
First, the overall flow of the main function:
int main (int argc, char **argv) {
    ........ // some variable declarations omitted
    if (!sanitycheck()) { // check the libevent version
        return EX_OSERR;
    }
    /* handle SIGINT */
    signal(SIGINT, sig_handler); // register SIGINT; sending the process SIGINT makes it exit
    /* init settings */
    settings_init(); // initialize default settings
    /*
       parse the command-line options with getopt
       ....................
    */
    if (hash_init(hash_type) != 0) { // initialize the hash table
        fprintf(stderr, "Failed to initialize hash_algorithm!\n");
        exit(EX_USAGE);
    }
    /* initialize main thread libevent instance */
    main_base = event_init(); // initialize libevent's main_base, a global
    /* initialize other stuff */
    stats_init();
    assoc_init(settings.hashpower_init);
    conn_init(); // allocate the conns array (see conn_init below)
    slabs_init(settings.maxbytes, settings.factor, preallocate); // initialize slabs
    /* start up worker threads if MT mode */
    thread_init(settings.num_threads, main_base); // the main thread creates the worker threads first
    /* initialise clock event */
    clock_handler(0, 0, 0);
    /* create the listening socket, bind it, and init; in here the main
       thread starts monitoring the listen fd through its epoll */
    if (settings.socketpath == NULL) {
        const char *portnumber_filename = getenv("MEMCACHED_PORT_FILENAME");
        char temp_portnumber_filename[PATH_MAX];
        FILE *portnumber_file = NULL;
        if (portnumber_filename != NULL) {
            snprintf(temp_portnumber_filename,
                     sizeof(temp_portnumber_filename),
                     "%s.lck", portnumber_filename);
            portnumber_file = fopen(temp_portnumber_filename, "a");
            if (portnumber_file == NULL) {
                fprintf(stderr, "Failed to open \"%s\": %s\n",
                        temp_portnumber_filename, strerror(errno));
            }
        }
        errno = 0;
        if (settings.port && server_sockets(settings.port, tcp_transport,
                                            portnumber_file)) {
            vperror("failed to listen on TCP port %d", settings.port);
            exit(EX_OSERR);
        }
        /*
         * initialization order: first create the listening sockets
         * (may need root on low ports), then drop root if needed,
         * then daemonise if needed, then init libevent (in some cases
         * descriptors created by libevent wouldn't survive forking).
         */
        /* create the UDP listening socket and bind it */
        errno = 0;
        if (settings.udpport && server_sockets(settings.udpport, udp_transport,
                                               portnumber_file)) {
            vperror("failed to listen on UDP port %d", settings.udpport);
            exit(EX_OSERR);
        }
        if (portnumber_file) {
            fclose(portnumber_file);
            rename(temp_portnumber_filename, portnumber_filename);
        }
    }
    /* enter the event loop: the main thread keeps waiting for external connections */
    if (event_base_loop(main_base, 0) != 0) {
        retval = EXIT_FAILURE;
    } // the main thread's epoll loop
}
That covers everything the main thread does. The original post follows with a call graph of main and walks through the functions highlighted in red in that figure. Note the main thread's event_base initialization above, as well as the server_sockets call that creates the listen fd event. Next, let's dig into how the main thread creates that event.
// strdup(): copies a string into newly allocated storage.
// strdup() calls malloc() internally, so when the returned string is no
// longer needed it must be released with free(), or memory leaks.
// port is the default 11211, or whatever the user set with -p.
// The main thread calls this function from main().
static int server_sockets(int port, enum network_transport transport,
                          FILE *portnumber_file) {
    // settings.inter may hold several IP addresses, separated by commas
    char *b;
    int ret = 0;
    // duplicate the string (malloc inside) so that strtok_r below does not
    // modify (pollute) the global settings.inter
    char *list = strdup(settings.inter);
    // this loop handles the multiple-IP case
    for (char *p = strtok_r(list, ";,", &b);
         p != NULL; // split out individual IPs, using ';' or ',' as separators
         p = strtok_r(NULL, ";,", &b)) {
        int the_port = port;
        char *s = strchr(p, ':'); // the server may have been started as -l ip:port
        // if a port follows the ip, i.e. the ip comes with its own port,
        // that port takes precedence over the one given with -p
        if (s != NULL) {
            *s = '\0'; // cut off the port so that p holds just the ip
            ++s;
            if (!safe_strtol(s, &the_port)) { // invalid port value
                return 1;
            }
        }
        if (strcmp(p, "*") == 0) {
            p = NULL;
        }
        // handle this one address; p is the ip (or hostname), or NULL for the wildcard
        ret |= server_socket(p, the_port, transport, portnumber_file);
    }
    free(list); // release the temporary copy
    return ret;
}
static conn *listen_conn = NULL; // list of listening conns (several IPs may be listened on at once)
// interface is an ip, a hostname, or NULL. The string carries no port;
// the port comes from the port parameter.
static int server_socket(const char *interface,
                         int port,
                         enum network_transport transport,
                         FILE *portnumber_file) {
    int sfd;
    struct linger ling = {0, 0};
    struct addrinfo *ai;
    struct addrinfo *next;
    struct addrinfo hints = { .ai_flags = AI_PASSIVE,
                              .ai_family = AF_UNSPEC };
    char port_buf[NI_MAXSERV];
    int success = 0;
    int flags = 1;
    hints.ai_socktype = IS_UDP(transport) ? SOCK_DGRAM : SOCK_STREAM;
    snprintf(port_buf, sizeof(port_buf), "%d", port);
    getaddrinfo(interface, port_buf, &hints, &ai);
    // if interface is a hostname, it may resolve to several addresses
    for (next = ai; next; next = next->ai_next) {
        conn *listen_conn_add;
        // create a socket and make it non-blocking
        sfd = new_socket(next); // calls socket()
        bind(sfd, next->ai_addr, next->ai_addrlen); // error handling trimmed in this excerpt
        success++;
        listen(sfd, settings.backlog);
        if (!(listen_conn_add = conn_new(sfd, conn_listening,
                                         EV_READ | EV_PERSIST, 1,
                                         transport, main_base))) {
            fprintf(stderr, "failed to create listening connection\n");
            exit(EXIT_FAILURE);
        }
        // chain all the listening conns into one list
        listen_conn_add->next = listen_conn;
        listen_conn = listen_conn_add;
    }
    freeaddrinfo(ai);
    /* Return zero iff we detected no errors in starting up connections */
    return success == 0;
}
static int new_socket(struct addrinfo *ai) {
    int sfd;
    int flags;
    sfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
    flags = fcntl(sfd, F_GETFL, 0);
    fcntl(sfd, F_SETFL, flags | O_NONBLOCK); // make the fd non-blocking
    return sfd;
}
So a socket is created for the user's IP and port, then bound and listened on to await clients. Because the main thread's socket fd is already non-blocking, listen() returns immediately. Back in main, the main thread eventually calls event_base_loop to enter its event loop and service connection requests. Note the call conn_new(sfd, conn_listening, EV_READ | EV_PERSIST, 1, transport, main_base): the event type is EV_READ | EV_PERSIST, and the connection's initial state is conn_listening, i.e. listening. When the listening socket becomes ready, the main thread invokes the event_handler callback, which enters the state machine. event_handler is examined in detail later.
Worker threads
1. The connection queue
Each worker thread owns a connection queue holding the data needed for each new network connection. The queue is implemented as a simple linked list.
/* An item in the connection queue. */
typedef struct conn_queue_item CQ_ITEM;
struct conn_queue_item {
    int sfd;                           // the connection's socket fd
    enum conn_states init_state;       // the connection's state; this matters a lot, as it decides how the conn is handled
    int event_flags;                   // which events to monitor (read and/or write)
    int read_buffer_size;
    enum network_transport transport;
    CQ_ITEM *next;                     // next item in the queue
};
/* A connection queue. */
typedef struct conn_queue CQ;
struct conn_queue {
    CQ_ITEM *head;                     // first node of the queue
    CQ_ITEM *tail;                     // last node of the queue
    pthread_mutex_t lock;              // mutex used to synchronize the threads
};
Because several threads touch these queues, the main thread must take the lock whenever it pushes a CQ_ITEM onto a worker's CQ.
/*
 * Initializes a connection queue (the queue head).
 */
static void cq_init(CQ *cq) {
    pthread_mutex_init(&cq->lock, NULL);
    cq->head = NULL;
    cq->tail = NULL;
}
/*
 * Adds an item to a connection queue: a plain tail insert.
 */
static void cq_push(CQ *cq, CQ_ITEM *item) {
    item->next = NULL;
    pthread_mutex_lock(&cq->lock); // must lock
    if (NULL == cq->tail)
        cq->head = item;
    else
        cq->tail->next = item;
    cq->tail = item;
    pthread_mutex_unlock(&cq->lock); // must unlock
}
/*
 * Returns the item, or NULL if no item is available: pops the head node.
 */
static CQ_ITEM *cq_pop(CQ *cq) {
    CQ_ITEM *item;
    pthread_mutex_lock(&cq->lock);
    item = cq->head;
    if (NULL != item) {
        cq->head = item->next;
        if (NULL == cq->head)
            cq->tail = NULL;
    }
    pthread_mutex_unlock(&cq->lock);
    return item;
}
This linked list is easy to implement. The connection queue it builds up is sketched below (the original post shows it as a diagram).
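Reconstructed from the struct definitions above, the shape of the queue is roughly:

    CQ (one per worker):
        head ──► CQ_ITEM ──► CQ_ITEM ──► ... ──► CQ_ITEM ──► NULL
                                                    ▲
        tail ───────────────────────────────────────┘
        lock    (taken by both cq_push and cq_pop)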
2. The thread structure
As the network-model diagram showed, every worker thread owns a connection queue so that it can serve many connections, and it also has to monitor read events on its pipe, and so on. memcached therefore wraps everything a thread needs into one structure per thread.
typedef struct {
    pthread_t thread_id;               /* thread ID */
    struct event_base *base;           /* this thread's own event_base */
    struct event notify_event;         /* event monitoring the pipe's read end */
    int notify_receive_fd;             /* read fd of the pipe */
    int notify_send_fd;                /* write fd of the pipe */
    struct thread_stats stats;         /* per-thread stats */
    struct conn_queue *new_conn_queue; /* head of (pointer to) the connection queue */
    cache_t *suffix_cache;             /* suffix cache */
    uint8_t item_lock_type;            /* use fine-grained or global item lock */
} LIBEVENT_THREAD; // the per-thread structure passed to pthread_create
The number of worker threads is settings.num_threads, configurable on the command line. A global array of these thread structures is declared as static LIBEVENT_THREAD *threads;. Being global, it is visible to the main thread and the workers alike, which makes pushing onto a worker's connection queue trivial.
static LIBEVENT_THREAD *threads; // global: both the main thread and the workers can reach it, which matters when pushing a CQ_ITEM onto a worker's connection queue
/*
 * Thread setup: creates the worker threads and fixes their callbacks.
 * nthreads comes from the command line.
 */
void thread_init(int nthreads, struct event_base *main_base) {
    .....
    // allocating a CQ_ITEM requires this lock; more on that later
    pthread_mutex_init(&cqi_freelist_lock, NULL);
    cqi_freelist = NULL;
    // only the thread-related initialization is shown here
    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD)); // allocate the per-thread structures
    if (! threads) {
        perror("Can't allocate thread descriptors");
        exit(1);
    }
    ......
    for (i = 0; i < nthreads; i++) { // create a pipe for each thread and fill in its structure
        int fds[2];
        if (pipe(fds)) {
            perror("Can't create notify pipe");
            exit(1);
        }
        threads[i].notify_receive_fd = fds[0];
        threads[i].notify_send_fd = fds[1];
        // give each thread its own event_base, with an event watching
        // notify_receive_fd for reads; also allocate its conn_queue
        setup_thread(&threads[i]);
        /* Reserve three fds for the libevent base, and two for the pipe */
        stats.reserved_fds += 5;
    }
    /* Create threads after we've done all the libevent setup. */
    for (i = 0; i < nthreads; i++) { // the important part:
        // create the thread; its thread function is worker_libevent,
        // and its argument is &threads[i]
        create_worker(worker_libevent, &threads[i]);
    }
}
/*
 * Fills in a thread structure's members.
 */
static void setup_thread(LIBEVENT_THREAD *me) {
    me->base = event_init(); // the thread's own event_base
    if (! me->base) {
        fprintf(stderr, "Can't allocate event base\n");
        exit(1);
    }
    /* Listen for notifications from other threads */
    event_set(&me->notify_event, me->notify_receive_fd,
              EV_READ | EV_PERSIST, thread_libevent_process, me);
    // create the event watching the pipe's read end; note the callback is
    // thread_libevent_process and its argument is the LIBEVENT_THREAD
    event_base_set(me->base, &me->notify_event); // bind the event to this thread's event_base
    if (event_add(&me->notify_event, 0) == -1) { // add it to this thread's epoll
        fprintf(stderr, "Can't monitor libevent notify pipe\n");
        exit(1);
    }
    me->new_conn_queue = malloc(sizeof(struct conn_queue)); // allocate the queue head
    if (me->new_conn_queue == NULL) {
        perror("Failed to allocate memory for connection queue");
        exit(EXIT_FAILURE);
    }
    cq_init(me->new_conn_queue); // initialize the queue head
    if (pthread_mutex_init(&me->stats.mutex, NULL) != 0) {
        perror("Failed to initialize mutex");
        exit(EXIT_FAILURE);
    }
    me->suffix_cache = cache_create("suffix", SUFFIX_SIZE, sizeof(char*),
                                    NULL, NULL);
    if (me->suffix_cache == NULL) {
        fprintf(stderr, "Failed to create suffix cache\n");
        exit(EXIT_FAILURE);
    }
}
/*
 * With the structure filled in, actually create the thread. The thread
 * function is worker_libevent; its argument is the LIBEVENT_THREAD.
 */
static void create_worker(void *(*func)(void *), void *arg) {
    pthread_t thread;
    pthread_attr_t attr;
    int ret;
    pthread_attr_init(&attr);
    if ((ret = pthread_create(&thread, &attr, func, arg)) != 0) {
        fprintf(stderr, "Can't create thread: %s\n",
                strerror(ret));
        exit(1);
    }
}
Two callbacks in the code above deserve close attention.
First, thread_libevent_process, the callback that fires when a thread's pipe becomes readable; its argument is the LIBEVENT_THREAD. It handles the signals the main thread sends to the worker through the pipe.
Second, worker_libevent, the function each worker thread runs; its argument is also the LIBEVENT_THREAD. All of a thread's connection handling takes place inside it.
Let's look at both in detail. They are a textbook example of how to structure a multithreaded program, well worth studying.
/*
 * Worker thread: each worker's own event_base loop.
 */
static void *worker_libevent(void *arg) {
    LIBEVENT_THREAD *me = arg;
    /* Any per-thread setup can happen here; thread_init() will block until
     * all threads have finished initializing.
     */
    /* set an indexable thread-specific memory item for the lock type.
     * this could be unnecessary if we pass the conn *c struct through
     * all item_lock calls...
     */
    me->item_lock_type = ITEM_LOCK_GRANULAR;
    pthread_setspecific(item_lock_type_key, &me->item_lock_type);
    register_thread_initialized();
    event_base_loop(me->base, 0); // the worker enters its event loop, waiting for the main thread to write the pipe
    return NULL;
}
The worker's main function does almost nothing itself: it just enters the event loop. The worker first waits for its pipe to become readable, then sets up the connection the main thread handed over and adds that socket to its epoll, and goes back to waiting, in a wait, handle, wait cycle.
/*
 * Called when a worker's pipe becomes readable.
 */
static void thread_libevent_process(int fd, short which, void *arg) {
    LIBEVENT_THREAD *me = arg;
    CQ_ITEM *item;
    char buf[1];
    if (read(fd, buf, 1) != 1) // read the byte the main thread wrote into the pipe
        if (settings.verbose > 0)
            fprintf(stderr, "Can't read from libevent pipe\n");
    switch (buf[0]) { // what did the main thread ask for?
    case 'c':
        // the new connection's fd was already pushed onto the connection
        // queue by the main thread; pop it here (cq_pop unlinks the item
        // from the queue), then have this worker epoll the connection
        item = cq_pop(me->new_conn_queue);
        if (NULL != item) {
            // add sfd to this worker's epoll
            conn *c = conn_new(item->sfd, item->init_state, item->event_flags,
                               item->read_buffer_size, item->transport, me->base);
            if (c == NULL) {
                if (IS_UDP(item->transport)) {
                    fprintf(stderr, "Can't listen for events on UDP socket\n");
                    exit(1);
                } else {
                    if (settings.verbose > 0) {
                        fprintf(stderr, "Can't listen for events on fd %d\n",
                                item->sfd);
                    }
                    close(item->sfd);
                }
            } else {
                c->thread = me;
            }
            cqi_free(item);
        }
        break;
    /* we were told to flip the lock type and report in */
    case 'l':
        me->item_lock_type = ITEM_LOCK_GRANULAR;
        register_thread_initialized();
        break;
    case 'g':
        me->item_lock_type = ITEM_LOCK_GLOBAL;
        register_thread_initialized();
        break;
    }
}
So after the main thread accepts a connection, it pushes the connection's fd onto the chosen worker's connection queue, then writes a character into that worker's pipe telling it how to handle the connection. Each worker's new_conn_queue holds the connections that the worker has not yet set up (not yet added to its epoll). Note that conn_new above creates a new connection object and registers it with the worker's epoll for read events. conn_new is examined in detail next.
3. The connection structure
memcached wraps every connection in a struct conn. The worker creates an event for the connection's fd and adds it to its own event_base for monitoring.
In fact memcached creates one conn per socket fd (that is, per connection) to manage that fd. A connection carries a lot of data and state, so a structure is needed to keep track of it all.
The maximum number of simultaneously connected clients (the total across all worker threads) is fixed at startup. It can be set on the command line and defaults to 1024, meaning at most 1024 connections are handled at once. Knowing this number, memcached can allocate one array up front (in conn_init) and hand out an element for each new connection, which effectively avoids memory fragmentation.
/**
 * The structure representing a connection into memcached.
 */
typedef struct conn conn;
struct conn {
    int sfd;
    sasl_conn_t *sasl_conn;
    bool authenticated;
    enum conn_states state;
    enum bin_substates substate;
    rel_time_t last_cmd_time;
    struct event event;
    short ev_flags;
    short which;  /** which events were just triggered */
    char *rbuf;   /** buffer to read commands into */
    char *rcurr;  /** but if we parsed some already, this is where we stopped */
    int rsize;    /** total allocated size of rbuf */
    int rbytes;   /** how much data, starting from rcur, do we have unparsed */
    char *wbuf;
    char *wcurr;
    int wsize;
    int wbytes;
    /** which state to go into after finishing current write */
    enum conn_states write_and_go;
    void *write_and_free; /** free this memory after finishing writing */
    char *ritem;  /** when we read in an item's value, it goes here */
    int rlbytes;
    /* data for the nread state */
    /**
     * item is used to hold an item structure created after reading the command
     * line of set/add/replace commands, but before we finished reading the actual
     * data. The data is read into ITEM_data(item) to avoid extra copying.
     */
    void *item;   /* for commands set/add/replace */
    /* data for the swallow state */
    int sbytes;   /* how many bytes to swallow */
    /* data for the mwrite state */
    struct iovec *iov;
    int iovsize;  /* number of elements allocated in iov[] */
    int iovused;  /* number of elements used in iov[] */
    struct msghdr *msglist;
    int msgsize;  /* number of elements allocated in msglist[] */
    int msgused;  /* number of elements used in msglist[] */
    int msgcurr;  /* element in msglist[] being transmitted now */
    int msgbytes; /* number of bytes in current msg */
    item **ilist; /* list of items to write out */
    int isize;
    item **icurr;
    int ileft;
    char **suffixlist;
    int suffixsize;
    char **suffixcurr;
    int suffixleft;
    enum protocol protocol;           /* which protocol this connection speaks */
    enum network_transport transport; /* what transport is used by this connection */
    /* data for UDP clients */
    int request_id;                   /* Incoming UDP request ID, if this is a UDP "connection" */
    struct sockaddr_in6 request_addr; /* udp: Who sent the most recent request */
    socklen_t request_addr_size;
    unsigned char *hdrbuf;            /* udp packet headers */
    int hdrsize;                      /* number of headers' worth of space is allocated */
    bool noreply;                     /* True if the reply should not be sent. */
    /* current stats command */
    struct {
        char *buffer;
        size_t size;
        size_t offset;
    } stats;
    /* Binary protocol stuff */
    /* This is where the binary header goes */
    protocol_binary_request_header binary_header;
    uint64_t cas; /* the cas to return */
    short cmd;    /* current command being processed */
    int opaque;
    int keylen;
    conn *next;   /* Used for generating a list of conn structures */
    LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
};
conn **conns; // pointer to pointers: conns[i] points to a malloc'ed conn; this avoids memory fragmentation and makes looking a conn up by its fd trivial
/*
 * Allocates the conns array.
 */
static void conn_init(void) {
    /* We're unlikely to see an FD much higher than maxconns. */
    int next_fd = dup(1); // how many fds are already in use
    int headroom = 10;    /* reserve some file descriptors */
    struct rlimit rl;
    // settings.maxconns defaults to 1024
    max_fds = settings.maxconns + headroom + next_fd;
    /* But if possible, get the actual highest FD we can possibly ever see. */
    if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
        max_fds = rl.rlim_max;
    } else {
        fprintf(stderr, "Failed to query maximum file descriptor; "
                "falling back to maxconns\n");
    }
    close(next_fd);
    // Note that the number of conn pointers allocated is larger than
    // settings.maxconns (the max number of clients online at once),
    // because memcached indexes this array directly by the socket fd
    // value. That is also why headroom reserves a few extra fds for
    // unexpected situations.
    if ((conns = calloc(max_fds, sizeof(conn *))) == NULL) {
        fprintf(stderr, "Failed to allocate connection structures\n");
        /* This is unrecoverable so bail out early. */
        exit(1);
    }
}
The memory model (shown as a diagram in the original post) is as follows:
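Since the original diagram is not reproduced here, this sketch, reconstructed from conn_init and conn_new, shows the idea:

    conns ──► [ conn* │ conn* │ conn* │ ... │ conn* ]   (max_fds slots)
                         │
                         └──► struct conn, allocated lazily the first time
                              an fd with this value is accepted; the socket
                              fd value itself is the array index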
// Allocates a conn structure for sfd, creates an event for the fd, and has
// base monitor that event.
conn *conn_new(const int sfd, enum conn_states init_state, // for the listener, init_state is conn_listening
               const int event_flags,
               const int read_buffer_size, enum network_transport transport,
               struct event_base *base) {
    conn *c;
    assert(sfd >= 0 && sfd < max_fds);
    c = conns[sfd]; // index directly by fd; the slot is already reserved
    if (NULL == c) { // no earlier connection ever used this fd value, so allocate a conn
        if (!(c = (conn *)calloc(1, sizeof(conn)))) {
            fprintf(stderr, "Failed to allocate connection object\n");
            return NULL;
        }
        ... // initialize some of the members
        c->sfd = sfd;
        conns[sfd] = c; // hand the structure over to the conns array
    }
    ... // initialize more of the members
    c->state = init_state; // conn_listening for the listener
    // equivalent to event_assign; the event's callback is event_handler
    event_set(&c->event, sfd, event_flags, event_handler, (void *)c);
    event_base_set(base, &c->event);
    c->ev_flags = event_flags;
    if (event_add(&c->event, 0) == -1) { // add the event to the event_base
        perror("event_add");
        return NULL;
    }
    return c; // return the connection pointer
}
Pay close attention to the registered callback, the event type, and the argument passed in: event_flags is the event type (for dispatched connections the main thread sets it in the CQ_ITEM); event_handler is the callback invoked when an established connection's event fires; and the conn is the argument handed to that callback.
The event callback: event_handler
event_handler handles both kinds of events: the listening socket and established connections.
void event_handler(const int fd, const short which, void *arg) {
    conn *c;
    c = (conn *)arg;
    assert(c != NULL);
    c->which = which; // record which events fired
    /* sanity */
    if (fd != c->sfd) { // the fd must match the conn, otherwise bail
        if (settings.verbose > 0)
            fprintf(stderr, "Catastrophic: event fd doesn't match conn fd!\n");
        conn_close(c);
        return;
    }
    drive_machine(c); // run the state machine, examined in detail below
    /* wait for next event */
    return;
}
This single callback is shared by every socket that becomes readable. It looks trivial, but inside it a state machine handles each connection's socket. Let's analyze that state machine carefully.
/**
 * A connection is always in exactly one of the following states.
 */
enum conn_states {
    conn_listening,  /**< the socket which listens for connections */
    conn_new_cmd,    /**< Prepare connection for next command */
    conn_waiting,    /**< waiting for a readable socket */
    conn_read,       /**< reading in a command line */
    conn_parse_cmd,  /**< try to parse a command from the input buffer */
    conn_write,      /**< writing out a simple response */
    conn_nread,      /**< reading in a fixed number of bytes */
    conn_swallow,    /**< swallowing unnecessary bytes w/o storing */
    conn_closing,    /**< closing this connection */
    conn_mwrite,     /**< writing out many items sequentially */
    conn_closed,     /**< connection is closed */
    conn_max_state   /**< Max state value (used for assertion) */
};
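For orientation, this is the typical path a worker-side connection takes through these states, reconstructed from the switch statement below (a simplification; every error path leads to conn_closing):

    conn_new_cmd   -> conn_waiting        (nothing buffered yet; wait for data)
    conn_waiting   -> conn_read           (socket readable; read into rbuf)
    conn_read      -> conn_parse_cmd      (try to parse a complete command)
    conn_parse_cmd -> conn_nread          (storage commands: read the value body)
    conn_nread     -> conn_write/conn_mwrite   (send the reply)
    conn_mwrite    -> conn_new_cmd        (ready for the next command)

With that map in mind, here is the state machine itself: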
static void drive_machine(conn *c) {
    /* The function header and these locals are restored from the memcached
       source so the excerpt reads complete; the original post began at the
       switch. The switch runs repeatedly until a branch sets stop. */
    bool stop = false;
    int sfd;
    socklen_t addrlen;
    struct sockaddr_storage addr;
    int nreqs = settings.reqs_per_event;
    int res;
    const char *str;
#ifdef HAVE_ACCEPT4
    static int use_accept4 = 1;
#else
    static int use_accept4 = 0;
#endif

    assert(c != NULL);

    while (!stop) {
        switch(c->state) {
        case conn_listening:
            addrlen = sizeof(addr);
#ifdef HAVE_ACCEPT4
            if (use_accept4) {
                sfd = accept4(c->sfd, (struct sockaddr *)&addr, &addrlen, SOCK_NONBLOCK);
            } else {
                sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
            }
#else
            sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
#endif
            if (sfd == -1) {
                if (use_accept4 && errno == ENOSYS) {
                    use_accept4 = 0;
                    continue;
                }
                perror(use_accept4 ? "accept4()" : "accept()");
                if (errno == EAGAIN || errno == EWOULDBLOCK) {
                    /* these are transient, so don't log anything */
                    stop = true;
                } else if (errno == EMFILE) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Too many open connections\n");
                    accept_new_conns(false);
                    stop = true;
                } else {
                    perror("accept()");
                    stop = true;
                }
                break;
            }
            if (!use_accept4) {
                if (fcntl(sfd, F_SETFL, fcntl(sfd, F_GETFL) | O_NONBLOCK) < 0) {
                    perror("setting O_NONBLOCK");
                    close(sfd);
                    break;
                }
            }
            if (settings.maxconns_fast &&
                stats.curr_conns + stats.reserved_fds >= settings.maxconns - 1) {
                str = "ERROR Too many open connections\r\n";
                res = write(sfd, str, strlen(str));
                close(sfd);
                STATS_LOCK();
                stats.rejected_conns++;
                STATS_UNLOCK();
            } else {
                dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
                                  DATA_BUFFER_SIZE, tcp_transport);
            }
            stop = true;
            break;

        case conn_waiting:
            if (!update_event(c, EV_READ | EV_PERSIST)) {
                if (settings.verbose > 0)
                    fprintf(stderr, "Couldn't update event\n");
                conn_set_state(c, conn_closing);
                break;
            }
            conn_set_state(c, conn_read);
            stop = true;
            break;

        case conn_read:
            res = IS_UDP(c->transport) ? try_read_udp(c) : try_read_network(c);
            switch (res) {
            case READ_NO_DATA_RECEIVED:
                conn_set_state(c, conn_waiting);
                break;
            case READ_DATA_RECEIVED:
                conn_set_state(c, conn_parse_cmd);
                break;
            case READ_ERROR:
                conn_set_state(c, conn_closing);
                break;
            case READ_MEMORY_ERROR: /* Failed to allocate more memory */
                /* State already set by try_read_network */
                break;
            }
            break;

        case conn_parse_cmd:
            if (try_read_command(c) == 0) {
                /* wee need more data! */
                conn_set_state(c, conn_waiting);
            }
            break;

        case conn_new_cmd:
            /* Only process nreqs at a time to avoid starving other
               connections */
            --nreqs;
            if (nreqs >= 0) {
                reset_cmd_handler(c);
            } else {
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.conn_yields++;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                if (c->rbytes > 0) {
                    /* We have already read in data into the input buffer,
                       so libevent will most likely not signal read events
                       on the socket (unless more data is available. As a
                       hack we should just put in a request to write data,
                       because that should be possible ;-)
                    */
                    if (!update_event(c, EV_WRITE | EV_PERSIST)) {
                        if (settings.verbose > 0)
                            fprintf(stderr, "Couldn't update event\n");
                        conn_set_state(c, conn_closing);
                        break;
                    }
                }
                stop = true;
            }
            break;

        case conn_nread:
            if (c->rlbytes == 0) {
                complete_nread(c);
                break;
            }
            /* Check if rbytes < 0, to prevent crash */
            if (c->rlbytes < 0) {
                if (settings.verbose) {
                    fprintf(stderr, "Invalid rlbytes to read: len %d\n", c->rlbytes);
                }
                conn_set_state(c, conn_closing);
                break;
            }
            /* first check if we have leftovers in the conn_read buffer */
            if (c->rbytes > 0) {
                int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
                if (c->ritem != c->rcurr) {
                    memmove(c->ritem, c->rcurr, tocopy);
                }
                c->ritem += tocopy;
                c->rlbytes -= tocopy;
                c->rcurr += tocopy;
                c->rbytes -= tocopy;
                if (c->rlbytes == 0) {
                    break;
                }
            }
            /* now try reading from the socket */
            res = read(c->sfd, c->ritem, c->rlbytes);
            if (res > 0) {
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.bytes_read += res;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                if (c->rcurr == c->ritem) {
                    c->rcurr += res;
                }
                c->ritem += res;
                c->rlbytes -= res;
                break;
            }
            if (res == 0) { /* end of stream */
                conn_set_state(c, conn_closing);
                break;
            }
            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                if (!update_event(c, EV_READ | EV_PERSIST)) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't update event\n");
                    conn_set_state(c, conn_closing);
                    break;
                }
                stop = true;
                break;
            }
            /* otherwise we have a real error, on which we close the connection */
            if (settings.verbose > 0) {
                fprintf(stderr, "Failed to read, and not due to blocking:\n"
                        "errno: %d %s \n"
                        "rcurr=%lx ritem=%lx rbuf=%lx rlbytes=%d rsize=%d\n",
                        errno, strerror(errno),
                        (long)c->rcurr, (long)c->ritem, (long)c->rbuf,
                        (int)c->rlbytes, (int)c->rsize);
            }
            conn_set_state(c, conn_closing);
            break;

        case conn_swallow:
            /* we are reading sbytes and throwing them away */
            if (c->sbytes == 0) {
                conn_set_state(c, conn_new_cmd);
                break;
            }
            /* first check if we have leftovers in the conn_read buffer */
            if (c->rbytes > 0) {
                int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;
                c->sbytes -= tocopy;
                c->rcurr += tocopy;
                c->rbytes -= tocopy;
                break;
            }
            /* now try reading from the socket */
            res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);
            if (res > 0) {
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.bytes_read += res;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                c->sbytes -= res;
                break;
            }
            if (res == 0) { /* end of stream */
                conn_set_state(c, conn_closing);
                break;
            }
            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                if (!update_event(c, EV_READ | EV_PERSIST)) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't update event\n");
                    conn_set_state(c, conn_closing);
                    break;
                }
                stop = true;
                break;
            }
            /* otherwise we have a real error, on which we close the connection */
            if (settings.verbose > 0)
                fprintf(stderr, "Failed to read, and not due to blocking\n");
            conn_set_state(c, conn_closing);
            break;

        case conn_write:
            /*
             * We want to write out a simple response. If we haven't already,
             * assemble it into a msgbuf list (this will be a single-entry
             * list for TCP or a two-entry list for UDP).
             */
            if (c->iovused == 0 || (IS_UDP(c->transport) && c->iovused == 1)) {
                if (add_iov(c, c->wcurr, c->wbytes) != 0) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Couldn't build response\n");
                    conn_set_state(c, conn_closing);
                    break;
                }
            }
            /* fall through... */
        case conn_mwrite:
            if (IS_UDP(c->transport) && c->msgcurr == 0 && build_udp_headers(c) != 0) {
                if (settings.verbose > 0)
                    fprintf(stderr, "Failed to build UDP headers\n");
                conn_set_state(c, conn_closing);
                break;
            }
            switch (transmit(c)) {
            case TRANSMIT_COMPLETE:
                if (c->state == conn_mwrite) {
                    conn_release_items(c);
                    /* XXX: I don't know why this wasn't the general case */
                    if (c->protocol == binary_prot) {
                        conn_set_state(c, c->write_and_go);
                    } else {
                        conn_set_state(c, conn_new_cmd);
                    }
                } else if (c->state == conn_write) {
                    if (c->write_and_free) {
                        free(c->write_and_free);
                        c->write_and_free = 0;
                    }
                    conn_set_state(c, c->write_and_go);
                } else {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Unexpected state %d\n", c->state);
                    conn_set_state(c, conn_closing);
                }
                break;
            case TRANSMIT_INCOMPLETE:
            case TRANSMIT_HARD_ERROR:
                break; /* Continue in state machine. */
            case TRANSMIT_SOFT_ERROR:
                stop = true;
                break;
            }
            break;

        case conn_closing:
            if (IS_UDP(c->transport))
                conn_cleanup(c);
            else
                conn_close(c);
            stop = true;
            break;

        case conn_closed:
            /* This only happens if dormando is an idiot. */
            abort();
            break;

        case conn_max_state:
            assert(false);
            break;
        }
    }
    return;
}
Thanks to these states, the single callback event_handler can service every event: whenever any event fires, event_handler runs, and the connection's current state selects the branch to execute. That is the beauty of a finite state machine.
conn_listening: when the main thread adds the listen fd to its epoll, the conn is in this state, so an incoming connection lands in the accept branch:
// The conn_listening branch of drive_machine again: this is the part the
// main thread runs when its listen event fires. It just accepts.
switch(c->state) {
case conn_listening:
    addrlen = sizeof(addr);
    sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
    if (sfd == -1) {
        if (use_accept4 && errno == ENOSYS) {
            use_accept4 = 0;
            continue;
        }
        perror(use_accept4 ? "accept4()" : "accept()");
        if (errno == EAGAIN || errno == EWOULDBLOCK) {
            /* these are transient, so don't log anything */
            stop = true;
        } else if (errno == EMFILE) {
            if (settings.verbose > 0)
                fprintf(stderr, "Too many open connections\n");
            accept_new_conns(false);
            stop = true;
        } else {
            perror("accept()");
            stop = true;
        }
        break;
    }
    if (!use_accept4) {
        if (fcntl(sfd, F_SETFL, fcntl(sfd, F_GETFL) | O_NONBLOCK) < 0) {
            perror("setting O_NONBLOCK");
            close(sfd);
            break;
        }
    }
    if (settings.maxconns_fast &&
        stats.curr_conns + stats.reserved_fds >= settings.maxconns - 1) {
        str = "ERROR Too many open connections\r\n";
        res = write(sfd, str, strlen(str));
        close(sfd);
        STATS_LOCK();
        stats.rejected_conns++;
        STATS_UNLOCK();
    } else {
        // pick a worker, allocate a CQ_ITEM, and hand it to that worker
        dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
                          DATA_BUFFER_SIZE, tcp_transport);
    }
    stop = true;
    break;
/*
 * The main thread picks a worker and notifies it by writing to its pipe.
 */
void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags,
                       int read_buffer_size, enum network_transport transport) {
    CQ_ITEM *item = cqi_new(); // allocate a CQ_ITEM
    char buf[1];
    if (item == NULL) {
        close(sfd);
        /* given that malloc failed this may also fail, but let's try */
        fprintf(stderr, "Failed to allocate memory for connection object\n");
        return ;
    }
    int tid = (last_thread + 1) % settings.num_threads; // plain round-robin: one connection to each thread in turn
    LIBEVENT_THREAD *thread = threads + tid; // the chosen worker's structure
    last_thread = tid;
    item->sfd = sfd;
    item->init_state = init_state;   // conn_new_cmd
    item->event_flags = event_flags; // EV_READ | EV_PERSIST
    item->read_buffer_size = read_buffer_size;
    item->transport = transport;
    cq_push(thread->new_conn_queue, item); // push the connection onto the worker's queue
    MEMCACHED_CONN_DISPATCH(sfd, thread->thread_id);
    buf[0] = 'c';
    if (write(thread->notify_send_fd, buf, 1) != 1) { // write one byte into the worker's pipe as the notification
        perror("Writing to thread notify pipe");
    }
}
Make sure you understand what dispatch_conn_new does:
1. It allocates a new CQ_ITEM, picks a suitable worker by round-robin, and pushes the item onto that worker's connection queue.
2. It notifies that worker by writing to the worker's pipe.
3. Why does each worker need its own connection queue? Suppose a worker is busy with a long-running get request while the main thread has already dispatched three pending connections to it (three items pushed onto the queue, three 'c' bytes written to the pipe). The pipe is now readable. When the worker next returns from epoll_wait it reads only a single byte, but because epoll is level-triggered by default the fd stays readable, so the worker keeps waking up until it has read every byte and set up every queued connection. No connection can ever be lost (see the sketch below). This is where the multithreaded design shines: it costs less than processes and needs only a little synchronization, such as the lock around the CQ queue above.
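To see that last point concretely, here is a tiny standalone sketch (again my own illustration, not memcached code): three notification bytes sit in a pipe, we consume only one byte per epoll_wait wakeup, and level-triggered epoll keeps reporting the fd readable until the pipe is drained.
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void) {
    int fds[2];
    pipe(fds);
    write(fds[1], "ccc", 3);            /* the "main thread" queued 3 connections */

    int ep = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = fds[0] };
    epoll_ctl(ep, EPOLL_CTL_ADD, fds[0], &ev);

    for (int wakeups = 1; wakeups <= 3; wakeups++) {
        struct epoll_event out;
        epoll_wait(ep, &out, 1, -1);    /* level-triggered: fires again while bytes remain */
        char c;
        read(fds[0], &c, 1);            /* consume exactly one notification */
        printf("wakeup %d: got '%c'\n", wakeups, c);
    }
    return 0;                           /* three wakeups, three notifications, none lost */
}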