深入剖析poll,epoll机制（源码分析）

最新推荐文章于 2024-09-09 22:29:38 发布

置顶若澜_

最新推荐文章于 2024-09-09 22:29:38 发布

阅读量602

点赞数

文章标签： epoll poll

本文链接：https://blog.csdn.net/sydney__/article/details/103832877

版权

之前理解了一下select,poll,epoll机制，知道他们的区别，但对于是什么造成这种区别或是具体什么操作让epoll拥有更好的性质还是一知半解，于是决心研究下他们的源码

一.首先来看poll机制

int poll (struct pollfd *fds, unsigned int nfds, int timeout);

其中nfds是fds数组中pollfd对象的个数，timeout是poll函数超时的时间

poll机制用一个函数即可实现监控文件对象，它的文件对象是存储在pollfd结构中的

/*
 * One entry of the array passed to poll(): names a descriptor, the events
 * the caller asks about, and the slot where the result is written back
 * (do_pollfd() stores the resulting mask into revents).
 */
struct pollfd {
    int fd; /* file descriptor */
    short events; /* requested events to watch */
    short revents; /* returned events witnessed */
};

其中fd是被监听的文件描述符；events是告诉内核需要监听的事件；revents是内核返回的、该文件描述符上实际发生的事件。如果返回值>0，则表示数组fds中有事件发生的描述符数量；如果所检测的描述符集合中没有任何事件发生，那它会一直阻塞，直到超时返回0；如果调用失败就返回-1。下面来简单实践一下

#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/*
 * Minimal poll() demo: watch a single socket for readable / priority data
 * and report whether poll() failed, timed out, or saw an event.
 *
 * Returns 0 on timeout or event, 1 if the socket could not be created or
 * poll() itself failed.
 */
int main(void)
{
    struct pollfd pollfds;      /* the single descriptor handed to poll() */
    int timeout = 10000;        /* poll() timeout, in milliseconds */
    int ret = 0;

    /*
     * The descriptor must be valid before it is handed to poll(); create a
     * real socket instead of passing an uninitialized int.
     */
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    pollfds.fd = sockfd;                    /* descriptor to watch */
    pollfds.events = POLLIN | POLLPRI;      /* events we are interested in */
    pollfds.revents = 0;

    switch (poll(&pollfds, 1, timeout)) {   /* start watching */
    case -1:
        printf("poll error\n");
        ret = 1;
        break;
    case 0:
        printf("time out\n");
        break;
    default:
        printf("event happen\n");
        /* revents is a short; cast so the argument matches %x */
        printf("event value is 0x%x\n", (unsigned int)pollfds.revents);
        break;
    }

    close(sockfd);
    return ret;
}

都知道poll相对于select的优点就是它没有最大连接数的限制，原因就是它在内核中是用链表来存储监听对象的，下面来看看它的具体实现机制

程序调用poll函数时会通过中断进入内核层，然后调用一系列内核函数，调用流程如下

1.Sys_poll函数：对timeout处理，将其换成10ms精度，然后调用do_sys_poll

2.do_sys_poll函数：主要是执行两个函数

poll_initwait(&table);

fdcount = do_poll(nfds, head,&table, timeout);

其中poll_initwait函数主要是初始化一个poll_wqueue变量即table,这个变量的用处后面讲

3.do_poll函数：处理超时，循环里面遍历了poll监听的文件描述符链表，对每个pollfd调用do_pollfd，如果do_pollfd返回非0（该描述符上有事件）count就加1

/*
 * Core loop of poll(): repeatedly scan every pollfd in the poll_list chain
 * and sleep between scans until something is ready.
 *
 * @nfds:    not referenced in this body.
 * @list:    chain of poll_list chunks; each chunk holds walk->len pollfd
 *           entries.
 * @wait:    poll_wqueues whose embedded poll_table is handed to do_pollfd()
 *           on the first scan; wait->error is checked after each scan.
 * @timeout: remaining timeout, updated in place. 0 means "do not sleep",
 *           a negative value selects MAX_SCHEDULE_TIMEOUT.
 *
 * Returns the number of pollfd entries for which do_pollfd() returned a
 * non-zero mask, or wait->error if that is non-zero, or 0 when the loop
 * ends without events (timeout exhausted or signal pending).
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
                   struct poll_wqueues *wait, s64 *timeout)
{
    int count = 0;
    poll_table* pt = &wait->pt;

    /* Zero timeout: the caller will not sleep, so do_pollfd() gets a NULL table */
    if (!(*timeout))
        pt = NULL;

    /* Main loop: scan, then sleep, until an event, timeout expiry or a signal */
    for (;;) {
        struct poll_list *walk;
        long __timeout;

        /* Mark the task interruptible before scanning the descriptors */
        set_current_state(TASK_INTERRUPTIBLE);
        /* Walk every chunk of the poll_list chain */
        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {

                /*
                 * do_pollfd() calls the file's poll method with pt and
                 * returns the resulting event mask. Once one descriptor is
                 * ready, pt is cleared for the remaining entries.
                 */
                if (do_pollfd(pfd, pt)) {
                    count++;
                    pt = NULL;
                }
            }
        }

        /* After the first full scan no later scan passes the poll_table again */
        pt = NULL;
        /* Stop if events were found, no timeout remains, or a signal is pending */
        if (count || !*timeout || signal_pending(current))
            break;
        /* An error recorded in the poll_wqueues becomes the return value */
        count = wait->error;
        if (count)
            break;

        if (*timeout < 0) {
            /* Negative timeout: use the maximum schedule timeout */
            __timeout = MAX_SCHEDULE_TIMEOUT;
        } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {

            /* Timeout too large for one sleep: sleep a slice, keep the rest */
            __timeout = MAX_SCHEDULE_TIMEOUT - 1;
            *timeout -= __timeout;
        } else {
            /* Whole remaining timeout fits in one sleep */
            __timeout = *timeout;
            *timeout = 0;
        }

        /*
         * Sleep. The value returned by schedule_timeout() is added back to
         * *timeout (presumably the unslept remainder - confirm against the
         * scheduler documentation).
         */
        __timeout = schedule_timeout(__timeout);
        if (*timeout >= 0)
            *timeout += __timeout;
    }
    /* Restore the running state before returning to the caller */
    __set_current_state(TASK_RUNNING);
    return count;
}

4.对于每个文件对象是调用do_pollfd(pfd, pt)，在这里面就是将每个pollfd对象挂载到相应的文件的file_operation->poll(),当文件发生变化就会反过来通知pollfd

/*
 * Poll a single pollfd entry.
 *
 * Looks up the struct file behind pollfd->fd, asks its poll method (if any)
 * for the current event mask, filters that mask down to the requested
 * events plus POLLERR/POLLHUP, and stores the result in pollfd->revents.
 *
 * A negative fd yields 0; a descriptor with no open file yields POLLNVAL;
 * a file without a poll method starts from DEFAULT_POLLMASK.
 *
 * Returns the mask that was written to pollfd->revents.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int revents = 0;
    int fd = pollfd->fd;

    if (fd >= 0) {
        int fput_needed;
        struct file *filp = fget_light(fd, &fput_needed);

        if (filp == NULL) {
            /* descriptor does not refer to an open file */
            revents = POLLNVAL;
        } else {
            if (filp->f_op && filp->f_op->poll)
                /* poll method of the file; pwait is passed straight through */
                revents = filp->f_op->poll(filp, pwait);
            else
                revents = DEFAULT_POLLMASK;

            /* keep only requested events; errors and hang-ups always pass */
            revents &= pollfd->events | POLLERR | POLLHUP;
            fput_light(filp, fput_needed);
        }
    }

    pollfd->revents = revents;
    return revents;
}

理解下来整体流程是先会将监听的所有pollfd对象拷贝到内核中，在内核里循环遍历所有的fd，有事件就count++，最后返回，一直无事件并且超时了就返回0。在之前先将相应的对象挂载到其FILE* file的f_op->poll，这样当有事件发生就会回调将本进程唤醒，然后进程又来遍历全部的fd,然后保存有事件发生的对象，然后又拷贝到用户空间。可以看到会把fd合集拷贝两次，开销很大

二.epoll机制

　　epoll是对select和poll的改进，它提供了三个函数

int epoll_create(int size);    //创建一个 epoll 的句柄

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);    //可增删改监听对象

/*等待事件的产生*/
int epoll_wait( int epfd, struct epoll_event * events, int maxevents, int timeout );

关于其详细的使用在另一篇博客里有总结epoll机制详解，这里主要是分析它的源码

1.先来看其中一些关键的结构体

首先是epoll对象结构体

/*
 * Per-epoll-instance state. epoll_create1() allocates one through ep_alloc()
 * and epoll_ctl() reaches it through the epoll file's private_data.
 */
struct eventpoll {
    /* Taken with spin_lock_irqsave() around rdllist/ovflist updates */
    spinlock_t lock;
    /* Held by epoll_ctl() around ep_find() and the add/del/mod operation */
    struct mutex mtx;
    /* Woken by ep_insert()/ep_poll_callback() when an item becomes ready */
    wait_queue_head_t wq;

    /* Second wait queue; woken through ep_poll_safewake() */
    wait_queue_head_t poll_wait;

    struct list_head rdllist;   // list of epitems whose file is ready

    struct rb_root rbr; // red-black tree holding every monitored epitem

    /*
     * Singly linked list (through epitem->next). While it is not
     * EP_UNACTIVE_PTR, ep_poll_callback() pushes items here instead of
     * onto rdllist.
     */
    struct epitem *ovflist;

    struct user_struct *user;   // per-user data; ep_insert() checks user->epoll_watches against max_user_watches
};

里面用到红黑树来保存所有的监听对象，每个节点是一个epitem结构体，对应一个文件描述符

/*
 * One monitored file descriptor inside an eventpoll. ep_insert() allocates
 * it, fills it in and inserts it into the eventpoll's red-black tree.
 */
struct epitem {
    // each epitem corresponds to one fd and lives in the red-black tree
    struct rb_node rbn; // red-black tree node
    struct list_head rdllink;   // list node; links the item into eventpoll->rdllist when ready
    
    struct epitem *next;    // link for eventpoll->ovflist; EP_UNACTIVE_PTR when not chained
    struct epoll_filefd ffd;    // the fd and struct file this item refers to
    int nwait;  // number of eppoll_entry wait-queue hooks attached; set to -1 by ep_ptable_queue_proc() on allocation failure
    
    struct list_head pwqlist;   // head of the list of eppoll_entry objects (linked via llink)
   
    struct eventpoll *ep;   // the eventpoll this item belongs to
    struct list_head fllink;    // links the item into the target file's f_ep_links list
    struct epoll_event event;   // events the user is interested in
};

其中pwqlist是与文件对象相关联的一个变量，看到后面就能理解其含义。然后是其中的epoll_filefd结构体，里面保存了文件对象及其描述符

/*
 * Pairs a file descriptor with its struct file. ep_insert() fills it with
 * ep_set_ffd(), and ep_find() is called with the same (file, fd) pair.
 */
struct epoll_filefd {
    struct file *file;  /* the open file */
    int fd;             /* the descriptor number */
};

然后是eppoll_entry结构体，其主要是记录epitem挂在某个文件对象等待队列上的那个等待项

/*
 * Hook that ties an epitem to one wait queue of the target file.
 * ep_ptable_queue_proc() allocates and initialises it.
 */
struct eppoll_entry {
    struct list_head llink; // links this entry into the owning epitem's pwqlist
    struct epitem *base;    // the epitem this entry belongs to
    wait_queue_t wait;  // wait-queue entry; its wake function is ep_poll_callback
    wait_queue_head_t *whead;   // head of the wait queue that 'wait' was added to
};

然后是在poll队列上挂载的对象，每个epitem对象会挂载到文件对象的等待队列里

/*
 * Wrapper handed to the target file's poll method by ep_insert(): pt has
 * ep_ptable_queue_proc installed as its callback, and epi names the item
 * being inserted (the callback recovers it via ep_item_from_epqueue()).
 */
struct ep_pqueue {
    poll_table pt;          /* poll table passed to f_op->poll() */
    struct epitem *epi;     /* epitem currently being registered */
};

主要的结构体就这些，现在看到或许还有些茫然，没关系，再往下分析就会明了

2.相关函数

（1）epoll_create

/*
 * epoll_create(size): legacy entry point.
 *
 * size is only validated (it must be positive); the actual work is done by
 * epoll_create1() with no flags.
 *
 * Returns -EINVAL for size <= 0, otherwise whatever sys_epoll_create1(0)
 * returns.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size > 0)
        return sys_epoll_create1(0);

    return -EINVAL;
}

/*
 * epoll_create1(flags): create a new epoll instance.
 *
 * Only EPOLL_CLOEXEC is accepted in flags. An eventpoll is allocated with
 * ep_alloc() and then bound to a new file descriptor through
 * anon_inode_getfd(), using eventpoll_fops as its file operations and the
 * eventpoll as its private data.
 *
 * Returns the value of anon_inode_getfd() on success, -EINVAL for unknown
 * flags, or the negative error from ep_alloc()/anon_inode_getfd().
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    struct eventpoll *ep = NULL;    /* the new epoll instance */
    int ret;

    /* Compile-time check that the two close-on-exec constants agree */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;

    /* Build the struct eventpoll */
    ret = ep_alloc(&ep);
    if (ret < 0)
        return ret;

    /*
     * Attach the eventpoll to a file descriptor so that later calls can
     * find it again from the fd.
     */
    ret = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
                           O_RDWR | (flags & O_CLOEXEC));
    if (ret < 0)
        ep_free(ep);

    return ret;
}

epoll_create函数主要就是创建一个eventpoll对象管理整个epoll,可以看到它也对应了一个文件描述符

(2)epoll_ctl

/*
 * epoll_ctl(epfd, op, fd, event): add, remove or modify a watched descriptor.
 *
 * Error codes produced here:
 *   -EFAULT  the event structure could not be copied from user space
 *   -EBADF   epfd or fd does not refer to an open file
 *   -EPERM   the target file has no poll method
 *   -EINVAL  epfd and fd are the same file, epfd is not an epoll file,
 *            or op is not one of the three known operations
 *   -EEXIST  EPOLL_CTL_ADD on a descriptor that is already registered
 *   -ENOENT  EPOLL_CTL_DEL / EPOLL_CTL_MOD on an unregistered descriptor
 * Otherwise the result of ep_insert() / ep_remove() / ep_modify() is returned.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
    int rc;
    struct file *epfile, *target;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    /* Copy the user's event description in, for the ops that carry one */
    rc = -EFAULT;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto out;

    rc = -EBADF;
    epfile = fget(epfd);
    if (!epfile)
        goto out;

    target = fget(fd);
    if (!target)
        goto out_put_epfile;

    /* The target must provide a poll method, otherwise it cannot be watched */
    rc = -EPERM;
    if (!target->f_op || !target->f_op->poll)
        goto out_put_target;

    /* An epoll file cannot watch itself, and epfd must really be an epoll file */
    rc = -EINVAL;
    if (epfile == target || !is_file_epoll(epfile))
        goto out_put_target;

    ep = epfile->private_data;

    mutex_lock(&ep->mtx);

    /* Check whether this (file, fd) pair is already registered */
    epi = ep_find(ep, target, fd);

    rc = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (epi) {
            rc = -EEXIST;
        } else {
            /* POLLERR and POLLHUP are always reported */
            epds.events |= POLLERR | POLLHUP;
            rc = ep_insert(ep, &epds, target, fd);
        }
        break;

    case EPOLL_CTL_DEL:
        if (epi)
            rc = ep_remove(ep, epi);
        else
            rc = -ENOENT;
        break;

    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            rc = ep_modify(ep, epi, &epds);
        } else {
            rc = -ENOENT;
        }
        break;
    }

    mutex_unlock(&ep->mtx);

out_put_target:
    fput(target);
out_put_epfile:
    fput(epfile);
out:
    return rc;
}

根据用户设置的操作类型来调用相应的函数，也就是对应红黑树里的增加，删除，修改操作

来着重看看插入函数

static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                     struct file *tfile, int fd)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    struct epitem *epi;
    struct ep_pqueue epq;
    // 查看是否达到当前用户的最大监听数 
    if (unlikely(atomic_read(&ep->user->epoll_watches) >=
                 max_user_watches))
        return -ENOSPC;

    if (!(epi = kmem_***_alloc(epi_***, GFP_KERNEL)))
        return -ENOMEM;

    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    //将要监听的fd加入到创建的epitem中
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    epq.epi = epi;
    /* 初始化一个poll_table
     * 其实就是指定调用poll_wait时的回调函数,和我们关心哪些events,
     * ep_ptable_queue_proc()是相应的回调函数 */
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    
    // 将其挂载到文件对象等待队列上
    revents = tfile->f_op->poll(tfile, &epq.pt);
   
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;
    //这个就是每个文件会将所有监听自己的epitem链起来 
    spin_lock(&tfile->f_lock);
    list_add_tail(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);
    
    //将epitem插入到对应的eventpoll中去 
    ep_rbtree_insert(ep, epi);
    spin_lock_irqsave(&ep->lock, flags);
    // 到达这里后, 如果我们监听的fd已经有事件发生, 那就要处理一下 
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        // 将当前的epitem加入到ready list中去 
        list_add_tail(&epi->rdllink, &ep->rdllist);
        // Notify waiting tasks that events are available 
        // 唤醒eventpoll中的等待进程即epoll_wait
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }
    spin_unlock_irqrestore(&ep->lock, flags);
    atomic_inc(&ep->user->epoll_watches);
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);
    return 0;
    error_unregister:
    ep_unregister_pollwait(ep, epi);
   
    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);
    spin_unlock_irqrestore(&ep->lock, flags);
    kmem_***_free(epi_***, epi);
    return error;
}

插入函数主要就是创建一个epitem，并通过ep_ptable_queue_proc为其分配eppoll_entry、设置回调函数，将其挂载到设备等待队列。当设备就绪时就会唤醒等待队列上的所有等待者，回调函数就会被调用，然后通过回调函数将对应的epitem放到eventpoll对象的rdllist上，接着唤醒epoll_wait，然后epoll_wait就会收集rdllist中的epoll_event并将其拷贝到用户空间。

下面ep_ptable_queue_proc就是指定回调函数，ep_poll_callback是回调函数

/*
 * poll_table callback installed by ep_insert(): invoked with the wait queue
 * head @whead that the watched file wants pollers to sleep on.
 *
 * Allocates an eppoll_entry, sets ep_poll_callback() as its wake function,
 * adds it to @whead and links it into the epitem's pwqlist. On success
 * epi->nwait is incremented; if the allocation fails (or a previous call
 * already failed) epi->nwait is set to -1 so that ep_insert() can detect the
 * error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        /*
         * Set up the wait-queue entry with ep_poll_callback() as its wake
         * function, so a wake-up on whead runs that callback.
         */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        /* Hook the new entry onto the wait queue head supplied by the caller */
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* Signal the failure to ep_insert() */
        epi->nwait = -1;
    }
}

/*
 * Wake function attached to the watched file's wait queue by
 * ep_ptable_queue_proc(). It runs when that wait queue is woken and moves
 * the corresponding epitem onto the eventpoll's ready list.
 *
 * @wait: the wait-queue entry embedded in the eppoll_entry.
 * @mode, @sync: not referenced in this body.
 * @key:  when non-NULL, treated as the mask of events that occurred and
 *        matched against the events the item asked for.
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    /* Recover the epitem that owns this wait-queue entry */
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;

    spin_lock_irqsave(&ep->lock, flags);

    /* Nothing but private bits set in the event mask: do not queue the item */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * If the waker supplied an event mask and none of those events were
     * requested for this item, ignore the wake-up.
     */
    if (key && !((unsigned long) key & epi->event.events))
        goto out_unlock;

    /*
     * While ovflist is active, chain the item there (once) instead of
     * touching rdllist.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            epi->next = ep->ovflist;
            ep->ovflist = epi;
        }
        goto out_unlock;
    }

    /* Put the item on the ready list unless it is already there */
    if (!ep_is_linked(&epi->rdllink))
        list_add_tail(&epi->rdllink, &ep->rdllist);

    /* Wake up tasks sleeping on the eventpoll */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);

    /* The poll_wait wake-up is done after ep->lock has been released */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 1;
}

回调函数就是把收到event的epitem插入ep->rdllist中

到这其实整个epoll的底层机制就比较明了了，它相对于select,poll就是一种空间换时间的思想，它会在内核区域建立一个红黑树，每次添加一个监听对象就将其拷贝到内核包装为epitem对象，然后挂载到其设备等待队列上并设置回调函数，接着就添加进红黑树进行管理。当设备就绪，唤醒等待队列上的等待者时，就会调用这个回调函数，而这个回调函数会把就绪的epitem加入到rdllist，接着唤醒epoll_wait进程进行后续处理