memcached网络层操作详解

[liexusong原创]

memcached的网络层处理大概如下:

memcached网络层操作详解1

(1)  main函数创建一个侦听服务端的socket, 然后添加到libevent, 然后当有客户端连接时就会触发libevent事件, 并且调用drive_machine()函数, 就像上面图一样.

(2)  drive_machine()函数会根据connstate字段来判断一个要进行什么操作. memcached中有7种的操作状态, 定义如下:

 

enum conn_states {

    conn_listening

    conn_read,      

    conn_write,     

    conn_nread,     

    conn_swallow,   

    conn_closing,   

    conn_mwrite     

};

 

drive_machine()函数大概如下:

void drive_machine(conn *c) {

    int exit = 0;

    int sfd, flags = 1;

    socklen_t addrlen;

    struct sockaddr addr;

    conn *newc;

    int res;

 

    while (!exit) {

        switch(c->state) {

        case conn_listening:

            addrlen = sizeof(addr);

            if ((sfd = accept(c->sfd, &addr, &addrlen)) == -1) {

                if (errno == EAGAIN || errno == EWOULDBLOCK) {

                    exit = 1;

                    break;

                } else {

                    perror("accept()");

                }

                break;

            }

            //设置socket为非阻塞

            if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||

                fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {

                perror("setting O_NONBLOCK");

                close(sfd);

                break;

            }

 

            newc = conn_new(sfd, conn_read, EV_READ | EV_PERSIST);

            if (!newc) {

                if (settings.verbose > 0)

                    fprintf(stderr, "couldn't create new connection\n");

                close(sfd);

                break;

            }

 

            break;

        case conn_read:

            if (try_read_command(c)) {

                continue;

            }

            if (try_read_network(c)) {

                continue;

            }

           

            if (!update_event(c, EV_READ | EV_PERSIST)) {

                if (settings.verbose > 0)

                    fprintf(stderr, "Couldn't update event\n");

                c->state = conn_closing;

                break;

            }

            exit = 1;

           break;

            … … …

        }

        … … …

    }

}

 

drive_machine()函数大概就是根据conn *cstate字段来判断要进行的操作. 每个客户端连接到memcached之后都会把他添加到libevent, 然后就触发事件调用这个函数. 所以说这个函数是最重要的.

 

[1]conn_listening状态:

这个状态主要是侦听客户端连接, 把他设置为非阻塞, 然后通过调用conn_new()函数把他放到libevent. conn_new()函数大概如下:

conn *conn_new(int sfd, int init_state, int event_flags) {

    conn *c;

 

   

if (freecurr > 0) {

        //如果在freeconns中有数据, 直接去freeconns中取

        c = freeconns[--freecurr];

    } else {

        if (!(c = (conn *)malloc(sizeof(conn)))) {

            perror("malloc()");

            return 0;

        }

        c->rbuf = c->wbuf = 0;

        c->ilist = 0;

 

        c->rbuf = (char *) malloc(DATA_BUFFER_SIZE);//2048

        c->wbuf = (char *) malloc(DATA_BUFFER_SIZE);//2048

       

        c->ilist = (item **) malloc(sizeof(item *)*200);//分配200item指针

 

        //分配内存失败?

        if (c->rbuf == 0 || c->wbuf == 0 || c->ilist == 0) {

            if (c->rbuf != 0) free(c->rbuf);

            if (c->wbuf != 0) free(c->wbuf);

            if (c->ilist !=0) free(c->ilist);

            free(c);

            perror("malloc()");

            return 0;

        }

       

        c->rsize = c->wsize = DATA_BUFFER_SIZE;//2048

        c->isize = 200;

        stats.conn_structs++;//状态中增加一个conn

    }

 

    if (settings.verbose > 1) {

        if (init_state == conn_listening)

            fprintf(stderr, "<%d server listening\n", sfd);

        else

            fprintf(stderr, "<%d new client connection\n", sfd);

    }

 

    c->sfd = sfd;//保存socket文件描述符

    c->state = init_state;//初始化的处理状态

    c->rlbytes = 0;

    c->rbytes = c->wbytes = 0;

    c->wcurr = c->wbuf;//指向当前的缓存位置

    c->rcurr = c->rbuf;

    c->icurr = c->ilist;

    c->ileft = 0;

    c->iptr = c->ibuf;

    c->ibytes = 0;

 

    c->write_and_go = conn_read;//写完后进行读操作

    c->write_and_free = 0;

    c->item = 0;

 

    c->is_corked = 0;

 

    //设置event事件

    event_set(&c->event, sfd, event_flags, event_handler, (void *)c);

    c->ev_flags = event_flags;

 

    if (event_add(&c->event, 0) == -1) {

        free(c);

        return 0;

    }

 

    stats.curr_conns++;

    stats.total_conns++;

 

    return c;

}

 

conn_new()函数主要是新建一个conn对象, 并且初始化他的属性.

 

 

[2]conn_read状态:

状态conn_read处理中有两个函数比较重要, 就是try_read_network() 和try_read_command(). try_read_network()是从网络端处读取数据, 当读到足够的数据时调用try_read_command()来处理数据, try_read_command()函数调用parse_command()函数来解析命令, 然后根据命令的类型来改变连接的状态.

 

try_read_network()函数如下:

int try_read_network(conn *c) {

    int gotdata = 0;

    int res;

    while (1) {

        if (c->rbytes >= c->rsize) {

            char *new_rbuf = realloc(c->rbuf, c->rsize*2);

            if (!new_rbuf) {

                if (settings.verbose > 0)

                    fprintf(stderr, "Couldn't realloc input buffer\n");

                c->rbytes = 0;

                out_string(c, "SERVER_ERROR out of memory");

                c->write_and_go = conn_closing;

                return 1;

            }

            c->rbuf = new_rbuf; c->rsize *= 2;

        }

        res = read(c->sfd, c->rbuf + c->rbytes, c->rsize - c->rbytes);

        if (res > 0) {

            stats.bytes_read += res;

            gotdata = 1;

            c->rbytes += res;

            continue;

        }

        if (res == 0) {

           

            c->state = conn_closing;

            return 1;

        }

        if (res == -1) {

            if (errno == EAGAIN || errno == EWOULDBLOCK) break;

            else return 0;

        }

    }

    return gotdata;

}

 

这个函数的主要操作是, 从客服端处读取数据, 直到没有数据可读为止[res == -1 &&  (errno == EAGAIN || errno == EWOULDBLOCK)]. 读取数据的同时改变connrbytes字段, 表示已经读到的数据大小. 当read()调用返回0时表示连接已经断开, 修改连接的状态为关闭, 下次操作会从libevent中去掉.

 

try_read_command()函数如下:

int try_read_command(conn *c) {

    char *el, *cont;

 

    if (!c->rbytes)

        return 0;

    el = memchr(c->rbuf, '\n', c->rbytes);

    if (!el)

        return 0;

    cont = el + 1;

    //略过\r\n

    if (el - c->rbuf > 1 && *(el - 1) == '\r') {

        el--;

    }

    *el = '\0';

 

    process_command(c, c->rbuf);

 

    if (cont - c->rbuf < c->rbytes) {

        memmove(c->rbuf, cont, c->rbytes - (cont - c->rbuf));

    }

    c->rbytes -= (cont - c->rbuf);

    return 1;

}

 

try_read_command()函数主要是当从客户端读取的数据足够时才会处理的, 就是当读到有换行符\n时才会处理, 因为命令一般是以换行符结束的. 然后通过调用process_command()来解析命令. 解析完命令后, 把命令从缓存中去掉, 剩下剩余的数据, 操作如下:

if (cont - c->rbuf < c->rbytes) {

    memmove(c->rbuf, cont, c->rbytes - (cont - c->rbuf));

}


效果如下图:

memcached网络层操作详解1

其中绿色段是数据, 黄色段是命令, 处理完命令后, 会用数据段把命令段覆盖, 是通过memmove(c->rbuf, cont, c->rbytes - (cont - c->rbuf));这条语句来处理的.

process_command()函数如下:

void process_command(conn *c, char *command) {

   

    int comm = 0;

    int incr = 0;

 

   

 

    if (settings.verbose > 1)

        fprintf(stderr, "<%d %s\n", c->sfd, command);

 

   

    set_cork(c, 1);

 

    if ((strncmp(command, "add ", 4) == 0 && (comm = NREAD_ADD)) ||

        (strncmp(command, "set ", 4) == 0 && (comm = NREAD_SET)) ||

        (strncmp(command, "replace ", 8) == 0 && (comm = NREAD_REPLACE))) {

 

        char key[251];

        int flags;

        time_t expire;

        int len, res;

        item *it;

 

        //<command name> <key> <flags> <exptime> <bytes>\r\n

        //忽略command

        res = sscanf(command, "%*s %0s %u %lu %d\n", key, &flags, &expire, &len);

        if (res!=4 || strlen(key)==0 ) {

            out_string(c, "CLIENT_ERROR bad command line format");

            return;

        }

        expire = realtime(expire);

        it = item_alloc(key, flags, expire, len+2);//alloc new item

        if (it == 0) {

            out_string(c, "SERVER_ERROR out of memory");

           

            c->write_and_go = conn_swallow;

            c->sbytes = len+2;

            return;

        }

 

        c->item_comm = comm;

        c->item = it;

        c->rcurr = ITEM_data(it);//point to item data area (指向item的数据域)

        c->rlbytes = it->nbytes;

        c->state = conn_nread;

        return;

}

… …

}

 

process_command()函数主要是解析命令, 然后进行制定的操作. 如上代码, 就是处理add/set/replace命令的操作.最后完成add/set/replace操作后, 会把连接的状态设置为conn_nread. conn_nread状态的操作是从客户端中读取完整的数据. 然后完成add/set/replace的所有操作, 如把数据存储在item, 并且连接这个item. 最后把connrcurr指针指向item的数据域. 这样做是为了下面的conn_nread操作时可以直接把数据读到item的数据域中.

 

[3]conn_nread状态:

conn_nread状态是要从客户端连接处读取没有读完的数据, 并且把数据放到在conn_read状态时的item的数据域中.

 

 

[4]conn_write状态:

conn_write状态之前先要说说out_string()函数, 定义如下:

void out_string(conn *c, char *str) {

    int len;

 

    if (settings.verbose > 1)

        fprintf(stderr, ">%d %s\n", c->sfd, str);

 

    len = strlen(str);

    if (len + 2 > c->wsize) {

       

        str = "SERVER_ERROR output line too long";

        len = strlen(str);

    }

 

    strcpy(c->wbuf, str);

    strcat(c->wbuf, "\r\n");

    c->wbytes = len + 2;

    c->wcurr = c->wbuf;

 

    c->state = conn_write;

    c->write_and_go = conn_read;

    return;

}

 

就是把str字符串写到connwbuf缓存中, 然后设置conn的状态为conn_write, 表示下次操作要进行conn_write相关的操作, 最后设置connwrite_and_goconn_read, 表示写操作完毕以后要进行读操作.

 

conn_write状态对应的操作就是: 把wbuf缓存的数据写到客户端连接处, 操作如下:

case conn_write:

   

   

    if (c->wbytes == 0) {

        if (c->write_and_free) {

            free(c->write_and_free);

            c->write_and_free = 0;

        }

        c->state = c->write_and_go;

        if (c->state == conn_read)

            set_cork(c, 0);

        break;

    }

    res = write(c->sfd, c->wcurr, c->wbytes);

    if (res > 0) {

        stats.bytes_written += res;

        c->wcurr  += res;

        c->wbytes -= res;

        break;

    }

    if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {

        if (!update_event(c, EV_WRITE | EV_PERSIST)) {

            if (settings.verbose > 0)

                fprintf(stderr, "Couldn't update event\n");

            c->state = conn_closing;

            break;

        }               

        exit = 1;

        break;

    }

   

    if (settings.verbose > 0)

        fprintf(stderr, "Failed to write, and not due to blocking\n");

    c->state = conn_closing;

    break;

 

conn_write状态操作一般是调用out_string()函数设置的, 作用一般都是把一些消息发给客户端连接. 调用out_string()函数一般会立即执行conn_read状态对应的操作. 因为out_string()没有修改libevent的事件, 所以要立即执行conn_read状态对应的操作才行, 不然就不会触发相关的事件. 而执行conn_read状态操作时, 如果客户端不可写(write调用返回-1, 并且errno == EAGAIN || errno == EWOULDBLOCK), 那么就会修改libevent的事件为写(EV_WRITE) [ update_event(c, EV_WRITE | EV_PERSIST)], 之后就等libevent事件触发才再次执行conn_write状态对应的操作了.

 

[5]conn_mwrite状态:

conn_mwrite状态是当用户要执行get命令时设置的. get命令处理如下:

if (strncmp(command, "get ", 4) == 0) {

 

    char *start = command + 4;

    char key[251];

    int next;

    int i = 0;

    item *it;

    time_t now = time(0);

 

    while(sscanf(start, " %0s%n", key, &next) >= 1) {

        start+=next;

        stats.get_cmds++;

        it = assoc_find(key);

        if (it && (it->it_flags & ITEM_DELETED)) {

            it = 0;

        }

        if (settings.oldest_live && it &&

            it->time <= settings.oldest_live) {

            it = 0;

        }

        if (it && it->exptime && it->exptime < now) {

            item_unlink(it);

            it = 0;

        }

 

        if (it) {

            stats.get_hits++;

            it->refcount++;

            item_update(it);

            *(c->ilist + i) = it;

            i++;

            if (i > c->isize) {

                c->isize *= 2;

                c->ilist = realloc(c->ilist, sizeof(item *)*c->isize);

            }

        } else stats.get_misses++;

    }

    c->icurr = c->ilist;

    c->ileft = i;

    if (c->ileft) {

        c->ipart = 0;

        c->state = conn_mwrite;

        c->ibytes = 0;

        return;

    } else {

        out_string(c, "END");

        return;

    }

}

 

此操作主要是根据用户提供的key来查找相关的item, 并且把item方法connitem列表中供conn_mwrite状态操作时使用.

 

conn_mrwrite操作时最复杂的, 代码如下:

case conn_mwrite:

   

    if (c->ibytes > 0) {

        res = write(c->sfd, c->iptr, c->ibytes);

        if (res > 0) {

            stats.bytes_written += res;

            c->iptr += res;

            c->ibytes -= res;

            break;

        }

        if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {

            if (!update_event(c, EV_WRITE | EV_PERSIST)) {

                if (settings.verbose > 0)

                    fprintf(stderr, "Couldn't update event\n");

                c->state = conn_closing;

                break;

            }

            exit = 1;

            break;

        }

       

        if (settings.verbose > 0)

            fprintf(stderr, "Failed to write, and not due to blocking\n");

        c->state = conn_closing;

        break;

    } else {

 

        item *it;

       

        switch (c->ipart) {

        case 1:

            it = *(c->icurr);

            assert((it->it_flags & ITEM_SLABBED) == 0);

            c->iptr = ITEM_data(it);

            c->ibytes = it->nbytes;

            c->ipart = 2;

            break;

        case 2:

            it = *(c->icurr);

            item_remove(it);

            c->ileft--;

            if (c->ileft <= 0) {

                c->ipart = 3;

                break;

            } else {

                c->icurr++;

            }

           

        case 0:

            it = *(c->icurr);

            assert((it->it_flags & ITEM_SLABBED) == 0);

            sprintf(c->ibuf, "VALUE %s %u %u\r\n", ITEM_key(it), it->flags, it->nbytes - 2);

            if (settings.verbose > 1)

                fprintf(stderr, ">%d sending key %s\n", c->sfd, ITEM_key(it));

            c->iptr = c->ibuf;

            c->ibytes = strlen(c->iptr);

            c->ipart = 1;

            break;

        case 3:

            out_string(c, "END");

            break;

        }

    }

    break;

 

这个操作主要是把get命令处获取到的item的数据发送到客户端连接.

主要难点是: conn_mwrite操作是遍历所有获取到的item, 然后把item的数据发送给客户端.

 

遍历的时候使用了一个switch来处理, 主要有4中情况, 如下:

(1) 0是发生key和数据的信息.

(2) 1是发生数据data.

(3) 2是减少itemrefcount和把当前item指针向后移动一个, 并且继续执行(1)的操作.

(4) 3是完成操作.

 

每次ibuf有数据的时候都会进行写操作, 代码如下:

    if (c->ibytes > 0) {

        res = write(c->sfd, c->iptr, c->ibytes);

        if (res > 0) {

            stats.bytes_written += res;

            c->iptr += res;

            c->ibytes -= res;

            break;

        }

        if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {

            if (!update_event(c, EV_WRITE | EV_PERSIST)) {

                if (settings.verbose > 0)

                    fprintf(stderr, "Couldn't update event\n");

                c->state = conn_closing;

                break;

            }

            exit = 1;

            break;

        }

       

        if (settings.verbose > 0)

            fprintf(stderr, "Failed to write, and not due to blocking\n");

        c->state = conn_closing;

        break;

    }

 

ibuf没有数据时, 才会遍历从get命令操作取得的item列表, 并取得数据复制到ibuf.

 

[6] conn_swallow状态:

conn_swallow状态是当内存不足并且调用add/set/replace命令的时候才会出现, 此操作主要是读取客户端发送过来的数据, 并且忽略掉. 操作如下:

case conn_swallow:

   

    if (c->sbytes == 0) {

        c->state = conn_read;

        break;

    }

 

   

    if (c->rbytes > 0) {

        int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;

        c->sbytes -= tocopy;

        if (c->rbytes > tocopy) {

            memmove(c->rbuf, c->rbuf+tocopy, c->rbytes - tocopy);

        }

        c->rbytes -= tocopy;

        break;

    }

 

   

    res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);

    if (res > 0) {

        stats.bytes_read += res;

        c->sbytes -= res;

        break;

    }

    if (res == 0) {

        c->state = conn_closing;

        break;

    }

    if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {

        if (!update_event(c, EV_READ | EV_PERSIST)) {

            if (settings.verbose > 0)

                fprintf(stderr, "Couldn't update event\n");

            c->state = conn_closing;

            break;

        }

        exit = 1;

        break;

    }

   

    if (settings.verbose > 0)

        fprintf(stderr, "Failed to read, and not due to blocking\n");

    c->state = conn_closing;

    break;


至此memcached所有的网络层操作已经全部分析完.
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值