epoll的内核实现

Linux加油站

已于 2023-03-10 17:35:36 修改

阅读量137

点赞数

文章标签： linux 网络 Powered by 金山文档

于 2023-03-09 21:04:11 首次发布

本文链接：https://blog.csdn.net/m0_74282605/article/details/129431354

版权

select/poll的缺点

A. 每次调用时重复的从用户态读入参数

B. 每次调用时全量的扫描文件描述符

C. 每次调用开始，将进程加入到每个文件描述符的等待队列，在调用结束后又把进程从等待队列中删除。

D. 在不修改内核的情况下，select最多支持1024个文件描述符。

文件系统中的一些重要结构

在linux中，进程通过files_struct结构与文件关联，而文件通过等待队列与进程关联，进而形成一种多对多的关系。

首先，进程的打开文件表 struct files_struct：

/*
 * Table of the files a process currently has open. The surrounding article
 * states that it is used from the process's task_struct.
 */
struct files_struct {
    atomic_t        count;              // atomic counter (presumably a usage/reference count -- TODO confirm)
    struct fdtable  *fdt;
    struct fdtable  fdtab;
    fd_set      close_on_exec_init;     // set of descriptors that must be closed on exec
    fd_set      open_fds_init;          // bitmask of the currently open descriptors
    struct file  *fd_array[NR_OPEN_DEFAULT];// array of file objects
    spinlock_t  file_lock;
};

该结构在进程的task_struct中使用，用于保存进程当前打开的文件集合。其中struct file称为文件对象，保存文件的信息，这里我们仅列出我们可能会用到的成员

/*
 * File object (abbreviated excerpt): the "...." / "..." lines stand for
 * members the article leaves out; only the member used below is shown.
 */
struct file{  
     ....  
     struct file_operations *f_op;  /* operations table for this file */
     ...  
}

在struct file_operations对象中存储对文件对象可以进行各种操作的指针：

/*
 * Table of function pointers for the operations that can be performed on a
 * file object; reached through struct file's f_op member.
 */
struct file_operations {
  struct module *owner;
  loff_t(*llseek) (struct file *, loff_t, int);
  ssize_t(*read) (struct file *, char __user *, size_t, loff_t *);
  ssize_t(*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
  ssize_t(*write) (struct file *, const char __user *, size_t, loff_t *);
  ssize_t(*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
  int (*readdir) (struct file *, void *, filldir_t);
  /* poll hook: epoll_ctl() rejects targets without it, and ep_insert() calls it */
  unsigned int (*poll) (struct file *, struct poll_table_struct *);
  int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
  int (*mmap) (struct file *, struct vm_area_struct *);
  int (*open) (struct inode *, struct file *);
  int (*flush) (struct file *);
  int (*release) (struct inode *, struct file *);
  int (*fsync) (struct file *, struct dentry *, int datasync);
  int (*aio_fsync) (struct kiocb *, int datasync);
  int (*fasync) (int, struct file *, int);
  int (*lock) (struct file *, int, struct file_lock *);
  ssize_t(*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
  ssize_t(*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
  ssize_t(*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *);
  ssize_t(*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
  unsigned long (*get_unmapped_area) (struct file *, unsigned long,
         unsigned long, unsigned long,
         unsigned long);
};

epoll模型

epoll自己保存传入的文件描述符，同时通过设备等待队列唤醒时调用“回调函数”实现事件的通知。

epoll模型将select/poll单个的操作拆分:

/* Create an epoll instance and return a file descriptor that refers to it. */
int epoll_create(int size);
/* Apply operation op (add / modify / delete) for target fd on the epoll instance epfd. */
int epoll_ctl(int epfd, int op, int fd ,struct epoll_event *event);
/* Wait on epfd for ready events; up to maxevents of them are stored in events. */
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

epoll机制实现了自己特有的文件系统eventpoll filesystem。

epoll_create创建一个属于该文件系统的文件，并返回其文件描述符。struct eventpoll 保存了epoll文件节点的扩展信息，该结构保存在文件对象的private_data域中，每个由epoll_create得到的文件描述符都分配了一个该结构体，让我们来看一下struct eventpoll的内部结构：

/*
 * Per-epoll-instance state. The article states that one of these is
 * allocated for every descriptor returned by epoll_create() and that it is
 * stored in the file's private_data.
 */
struct eventpoll {
    /* Protects this structure's own state; usable from interrupt context */
    spinlock_t lock;
    /*
     * Used from user process context
     */
    struct mutex mtx;
    /* Wait queue used by sys_epoll_wait(); callers of epoll_wait sleep here */
    wait_queue_head_t wq;
    /* Wait queue used by file->poll(); sleepers polling the epoll fd itself wait here */
    wait_queue_head_t poll_wait;
    /* List of ready file descriptors */
    struct list_head rdllist;
    /* Red-black tree root; the tree stores the descriptors being monitored */
    struct rb_root rbr;
    /*
     * Temporary singly linked list holding ready events
     */
    struct epitem *ovflist;
    /* The user that created the eventpoll descriptor */
    struct user_struct *user;
};

通过epoll_ctl接口加入该epoll描述符监听的套接字属于socket filesystem，每一个被监听的套接字都对应一个epitem结构体，这些epitem以红黑树的方式存储，eventpoll中的rbr成员指向该红黑树的root节点；而有监听事件到来的套接字所对应的epitem则以双向链表的形式保存，其头结点对应eventpoll中的rdllist成员。

/*
 * One monitored descriptor inside an epoll instance: linked into the
 * instance's red-black tree and, when ready, into its ready list.
 */
struct epitem {
    /* Red-black tree node */
    struct rb_node rbn;
    /* Node in the ready-descriptor list */
    struct list_head rdllink;
    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;
    /* Descriptor information (file pointer and fd) this item refers to */
    struct epoll_filefd ffd;
    /* Number of active wait queue attached to poll operations */
    int nwait;
    /* List containing poll wait queues */
    struct list_head pwqlist;
    /* The "container" of this item */
    struct eventpoll *ep;
    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;
    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
};

上述结构中fllink是一个链表节点，用于把该epitem挂到被监听文件（struct file）的f_ep_links链表上；struct file 称为文件结构，一般代表一个打开的文件描述符。

而epoll_filefd结构则表明了epitem对应的文件描述符信息：

/* Identifies the file an epitem refers to: the file structure plus its fd number. */
struct epoll_filefd {
   struct file *file;  // pointer to the file structure
   int fd;};           // the corresponding file descriptor

struct epoll_event event则表明了感兴趣的事件和原始的fd：

/*
 * User data attached to a registration. This is a union: all members share
 * the same storage, so only one of them can be meaningful at a time.
 * Declared before struct epoll_event because that structure embeds it.
 */
typedef union epoll_data
{
  void *ptr;   /* pointer to caller-defined data */
  int fd;
  uint32_t u32;
  uint64_t u64;
} epoll_data_t;

/*
 * Describes the events of interest together with the caller's user data.
 * The structure is declared packed, so `data` directly follows `events`
 * with no padding in between.
 */
struct epoll_event
{
  uint32_t events;      /* events to monitor */
  epoll_data_t data;    /* user data variable, free for caller-defined information */
} __attribute__ ((__packed__));

看完上面两个结构，让我们来看一下epoll_ctl的真身：

/*
 * epoll_ctl() system call: add, remove or modify the watch that the epoll
 * instance "epfd" holds on the target descriptor "fd".
 *
 * epfd:  descriptor of the epoll instance.
 * op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * fd:    descriptor of the target file.
 * event: user-space event description; copied in only when
 *        ep_op_has_event(op) is true.
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or a negative
 * errno: -EFAULT (copy from user failed), -EBADF (bad epfd or fd), -EPERM
 * (target has no poll method), -EINVAL (epfd is not an epoll file, epfd and
 * fd are the same file, or op is not one of the three handled values),
 * -EEXIST (ADD of a watched file), -ENOENT (DEL/MOD of an unwatched file).
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));
    error = -EFAULT;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;
    /* Get the "struct file *" for the eventpoll file */
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto error_return;
    /* Get the "struct file *" for the target file */
    tfile = fget(fd);
    if (!tfile)
        goto error_fput;
    /* The target file descriptor must support poll */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto error_tgt_fput;
    /*
     * We have to check that the file structure underneath the file descriptor
     * the user passed to us _is_ an eventpoll file. And also we do not permit
     * adding an epoll file descriptor inside itself.
     */
    error = -EINVAL;
    if (file == tfile || !is_file_epoll(file))
        goto error_tgt_fput;
    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = file->private_data;
    mutex_lock(&ep->mtx);
    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     */
    epi = ep_find(ep, tfile, fd);
    /* Stays -EINVAL when op matches none of the cases below */
    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            /* POLLERR and POLLHUP are always added to the requested mask */
            epds.events |= POLLERR | POLLHUP;
            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }
    mutex_unlock(&ep->mtx);
    /* Unwind in reverse order of acquisition: target file first, then the epoll file */
error_tgt_fput:
    fput(tfile);
error_fput:
    fput(file);
error_return:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, error));
    return error;
}

代码的逻辑很简单，注释也很清楚。上面我们已经介绍了文件对象结构，因此我们这里主要关注剩下的流程，以添加一个监听事件为例：

/*
 * Create an epitem for (tfile, fd) and attach it to the epoll instance "ep".
 *
 * ep:    the epoll instance; the in-code comments below state that the
 *        caller holds ep->mtx and has raised tfile's usage count.
 * event: the requested events and user data, copied into the new item.
 * tfile: the target file.
 * fd:    the target's descriptor number.
 *
 * Returns 0 on success, -ENOSPC when the user's watch limit has been
 * reached, or -ENOMEM when the item or one of its wait-queue entries could
 * not be allocated.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,struct file *tfile, int fd)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    struct epitem *epi;
    struct ep_pqueue epq;
    /* Refuse to exceed the maximum number of watches; each user has its own limit */
    if (unlikely(atomic_read(&ep->user->epoll_watches) >=
             max_user_watches))
        return -ENOSPC;
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;
    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    /* Initialize the poll table using the queue callback */
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);  /* register ep_ptable_queue_proc as the queue callback */
    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    revents = tfile->f_op->poll(tfile, &epq.pt);  /* run the file's poll method (the article notes this is tcp_poll for a TCP socket) */
    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;
    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_ep_lock);
    list_add_tail(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_ep_lock);
    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     */
    ep_rbtree_insert(ep, epi);
    /* We have to drop the new item inside our item list to keep track of it */
    spin_lock_irqsave(&ep->lock, flags);
    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }
    spin_unlock_irqrestore(&ep->lock, flags);
    atomic_inc(&ep->user->epoll_watches);
    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
             current, ep, tfile, fd));
    return 0;
error_unregister:
    ep_unregister_pollwait(ep, epi);
    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue. Note that we don't care about the ep->ovflist
     * list, since that is used/cleaned only inside a section bound by "mtx".
     * And ep_insert() is called with "mtx" held.
     */
    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);
    spin_unlock_irqrestore(&ep->lock, flags);
    kmem_cache_free(epi_cache, epi);
    return error;
}

本函数将epitem对象添加到eventpoll中，实际上是将用户的输入保存到eventpoll文件系统中。init_poll_funcptr函数将ep_ptable_queue_proc函数注册到poll table中，然后程序的下一步是调用tfile的poll函数，并且函数的第二个参数为poll table。ep_ptable_queue_proc函数的主要作用是注册等待函数，并添加到指定的等待队列，第一次调用后此信息已经存在，无需在poll函数中再次调用了。

/*
 * Queue callback installed in the poll table by ep_insert(). For the wait
 * queue head "whead" handed in by the file's poll method it allocates an
 * eppoll_entry whose wake-up function is ep_poll_callback, adds it to that
 * wait queue and links it on the item's pwqlist.
 *
 * Failure is reported through the item: nwait is set to -1 when the
 * allocation fails or when an earlier call has already failed.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,poll_table *pt)
{
    struct epitem *item = ep_item_from_epqueue(pt);
    struct eppoll_entry *entry = NULL;

    /* Only attempt the allocation while no earlier failure has been recorded */
    if (item->nwait >= 0)
        entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

    if (!entry) {
        /* We have to signal that an error occurred */
        item->nwait = -1;
        return;
    }

    /* Register the wake-up callback that runs when the target's wait queue is woken */
    init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
    entry->whead = whead;
    entry->base = item;
    add_wait_queue(whead, &entry->wait);
    list_add_tail(&entry->llink, &item->pwqlist);
    item->nwait++;
}

实际上，如果我们仅是使用epoll，则poll函数到底是什么我们是不需要关心的；严格来说，具体poll函数是怎么样的要看通过epoll_ctl传入的套接字类型。对于tcp套接字来说可以按照创建流程找到其对应的函数为tcp_poll, 该函数将进程注册到sock成员的sk_sleep等待队列中，在对应的IO事件中该队列被唤醒，进而调用相应的等待回调函数。

让我们来看一下注册的ep_poll_callback：

/*
 * Wake-up callback attached to a monitored file's wait queue by
 * ep_ptable_queue_proc(). It queues the item on the epoll instance's ready
 * list (or on ovflist while events are being transferred to user space) and
 * wakes the tasks waiting on the epoll instance.
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
             current, epi->ffd.file, epi, ep));
    /* Take the eventpoll spinlock with IRQs saved: this runs in interrupt context */
    spin_lock_irqsave(&ep->lock, flags);
    /* Nothing to report:
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;
    /*
     * If we are trasfering events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happens during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            epi->next = ep->ovflist;
            ep->ovflist = epi;
        }
        goto out_unlock;
    }
    /* If this file is already in the ready list we exit soon */
    if (ep_is_linked(&epi->rdllink))
        goto is_linked;
    /* Append the item to the ready list */
    list_add_tail(&epi->rdllink, &ep->rdllist);
is_linked:
    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;
out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);
    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);
    return 1;
}

这里有两个队列，一个是epoll_wait调用中使用的eventpoll等待队列（ep->wq），用于判断是否有监听套接字可用；另一个对应于每个套接字的等待队列sk_sleep，用于判断每个监听套接字上的事件，该队列唤醒后调用ep_poll_callback，在该函数中又调用wake_up_locked函数来唤醒前一种队列，来通知epoll_wait调用进程。

epoll_wait中调用下面的等待函数，一旦其被ep_poll_callback唤醒，则调用ep_send_events把事件复制到用户空间，进而epoll_wait返回。

/*
 * Wait until the epoll instance has ready items, the timeout runs out or a
 * signal is pending, then hand the ready events to ep_send_events().
 *
 * ep:        the epoll instance.
 * events:    user-space buffer passed through to ep_send_events().
 * maxevents: passed through to ep_send_events().
 * timeout:   timeout in milliseconds; a negative value or one that is
 *            >= EP_MAX_MSTIMEO means wait without a time limit.
 *
 * Returns the value of ep_send_events() when events were available, 0 when
 * nothing was available, or -EINTR when a signal was pending while waiting.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;
    /*
     * Calculate the timeout by checking for the "infinite" value ( -1 )
     * and the overflow condition. The passed timeout is in milliseconds,
     * that why (t * HZ) / 1000. The "+ 999" rounds the division up.
     */
    jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
        MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
retry:
    spin_lock_irqsave(&ep->lock, flags);
    res = 0;
    if (list_empty(&ep->rdllist)) {
        /*
         * We don't have any available event to return to the caller.
         * We need to sleep here, and we will be wake up by
         * ep_poll_callback() when events will become available.
         */
        init_waitqueue_entry(&wait, current);
        wait.flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(&ep->wq, &wait);
        for (;;) {
            /*
             * We don't want to sleep if the ep_poll_callback() sends us
             * a wakeup in between. That's why we set the task state
             * to TASK_INTERRUPTIBLE before doing the checks.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            /* Stop waiting once something is ready or the timeout is used up */
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }
            /* The lock is dropped around the sleep and re-taken afterwards */
            spin_unlock_irqrestore(&ep->lock, flags);
            jtimeout = schedule_timeout(jtimeout);
            spin_lock_irqsave(&ep->lock, flags);
        }
        __remove_wait_queue(&ep->wq, &wait);
        set_current_state(TASK_RUNNING);
    }
    /* Is it worth to try to dig for events ? */
    eavail = !list_empty(&ep->rdllist);
    spin_unlock_irqrestore(&ep->lock, flags);
    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     */
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
        goto retry;
    return res;
}

使用epoll的整个流程可以总结如下（copy来的）：

struct sock结构

该结构主要对应于一个socket描述符，通过上面的讲述再根据sock的结构，我们大致可以勾勒出一个事件到来到事件返回的全过程，涉及到Linux网络协议栈的等后面学习了再补充，当前仅贴出该数据结构：

/*
 * Socket structure as listed by the article. The members relevant to the
 * epoll discussion are the sk_sleep wait queue and the callback pointers at
 * the end of the structure.
 */
struct sock {
        struct sock_common  __sk_common;
#define sk_family       __sk_common.skc_family
#define sk_state        __sk_common.skc_state
#define sk_reuse        __sk_common.skc_reuse
#define sk_bound_dev_if     __sk_common.skc_bound_dev_if
#define sk_node         __sk_common.skc_node
#define sk_bind_node        __sk_common.skc_bind_node
#define sk_refcnt       __sk_common.skc_refcnt
        unsigned char       sk_shutdown : 2,
                            sk_no_check : 2,
                            sk_userlocks : 4;
        unsigned char       sk_protocol;
        unsigned short      sk_type;
        int         sk_rcvbuf;
        socket_lock_t       sk_lock;
        /* wait queue of the socket; the article says tcp_poll registers waiters here */
        wait_queue_head_t   *sk_sleep;
        struct dst_entry    *sk_dst_cache;
        struct xfrm_policy  *sk_policy[2];
        rwlock_t        sk_dst_lock;
        atomic_t        sk_rmem_alloc;
        atomic_t        sk_wmem_alloc;
        atomic_t        sk_omem_alloc;
        struct sk_buff_head sk_receive_queue;
        struct sk_buff_head sk_write_queue;
        int         sk_wmem_queued;
        int         sk_forward_alloc;
        unsigned int        sk_allocation;
        int         sk_sndbuf;
        int         sk_route_caps;
        int         sk_hashent;
        unsigned long       sk_flags;
        unsigned long           sk_lingertime;

        struct {
            struct sk_buff *head;
            struct sk_buff *tail;
        } sk_backlog;
        struct sk_buff_head sk_error_queue;
        struct proto        *sk_prot;
        struct proto        *sk_prot_creator;
        rwlock_t        sk_callback_lock;
        int         sk_err,
                    sk_err_soft;
        unsigned short      sk_ack_backlog;
        unsigned short      sk_max_ack_backlog;
        __u32           sk_priority;
        struct ucred        sk_peercred;
        int         sk_rcvlowat;
        long            sk_rcvtimeo;
        long            sk_sndtimeo;
        struct sk_filter        *sk_filter;
        void            *sk_protinfo;
        struct timer_list   sk_timer;
        struct timeval      sk_stamp;
        struct socket       *sk_socket;
        void            *sk_user_data;
        struct page     *sk_sndmsg_page;
        struct sk_buff      *sk_send_head;
        __u32           sk_sndmsg_off;
        int         sk_write_pending;
        void            *sk_security;
        void            (*sk_state_change)(struct sock *sk);// called when the socket state changes
        void            (*sk_data_ready)(struct sock *sk, int bytes);// called when data is available to read, i.e. the read event
        void            (*sk_write_space)(struct sock *sk);// called when the socket becomes writable, i.e. the write event
        void            (*sk_error_report)(struct sock *sk);// called on a socket error
        int             (*sk_backlog_rcv)(struct sock *sk,
                        struct sk_buff *skb);
        void            (*sk_destruct)(struct sock *sk);// called when the socket is released
    };

在该结构的最后有几个回调函数，其主要的作用就是在套接字状态发生变化时执行相应的回调，见名知意。

epoll使用范例

int epoll_create(int size); // size: the number of file descriptors to be monitored
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout); // timeout = -1: block; timeout = 0: return immediately

epoll_event的结构如下：

/* User data attached to a registration; a union, so all members share the same storage. */
typedef union epoll_data {
    void *ptr;
    int fd;
    __uint32_t u32;
    __uint64_t u64;
} epoll_data_t;
/*
 * Event description passed to epoll_ctl() and filled in by epoll_wait().
 * NOTE(review): the earlier definition in this article carries
 * __attribute__ ((__packed__)); this one does not -- confirm which layout
 * is intended for the target platform.
 */
struct epoll_event {
    __uint32_t events; /* Epoll events */
    epoll_data_t data; /* User data variable */
};

events可以是以下几个标志的集合：

EPOLLIN ：表示对应的文件描述符可以读（包括对端SOCKET正常关闭）；

EPOLLOUT：表示对应的文件描述符可以写；

EPOLLPRI：表示对应的文件描述符有紧急的数据可读（这里应该表示有带外数据到来）；

EPOLLERR：表示对应的文件描述符发生错误；

EPOLLHUP：表示对应的文件描述符被挂断；

EPOLLET：将EPOLL设为边缘触发(Edge Triggered)模式，这是相对于水平触发(Level Triggered)来说的。

EPOLLONESHOT：只监听一次事件，当监听完这次事件之后，该描述符即被禁用；如果还需要继续监听这个socket，需要通过EPOLL_CTL_MOD重新设置其监听事件。

而epoll_ctl的op参数可以为以下值：

EPOLL_CTL_ADD：注册新的fd到epfd中；

EPOLL_CTL_MOD：修改已经注册的fd的监听事件；

EPOLL_CTL_DEL：从epfd中删除一个fd；

随手附上一个简单的例子，不提供任何保证：

#include <errno.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

#define MAX_CONNECTION 1024
#define MAX_EVENTS 20          /* capacity of the epoll_wait() result array */
#define SELF_DEFINE_MAGIC 10   /* marker stored in self_define_data.data */

/*
 * Per-registration user data handed back through epoll_event.data.ptr.
 * epoll_data is a union, so data.ptr and data.fd cannot both be used; the
 * descriptor is therefore carried inside this record.
 */
struct self_define_data{
    int data;   /* marker value, checked when an event comes back */
    int fd;     /* descriptor this registration belongs to */
};

/*
 * Minimal epoll example: listen on 127.0.0.1:80, register the listening
 * socket edge-triggered, and accept-then-close every incoming connection.
 *
 * Returns 0 when the loop ends and cleanup succeeds, -1 on any failure.
 * Binding port 80 normally requires elevated privileges.
 */
int main(int argc, char* argv[]){
    (void)argc;
    (void)argv;

    int rc = -1;
    int nonblocking = 1;
    int listen_fd, client_fd, ep;
    struct sockaddr_in server, client;
    struct epoll_event ee, event_list[MAX_EVENTS];
    struct self_define_data self;

    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0) {
        perror("socket failed");
        return -1;
    }

    /* Non-blocking mode is required for the edge-triggered accept loop below
     * (fcntl(F_SETFL, O_NONBLOCK) would be the equivalent alternative). */
    if (ioctl(listen_fd, FIONBIO, &nonblocking) < 0) {
        perror("set non_block failed");
        goto out_close_listen;
    }

    memset(&server, 0, sizeof server);
    server.sin_family = AF_INET;
    server.sin_addr.s_addr = htonl(INADDR_LOOPBACK);   /* 127.0.0.1 */
    server.sin_port = htons(80);
    if (bind(listen_fd, (struct sockaddr *)&server, sizeof server) < 0) {
        perror("bind failed");
        goto out_close_listen;
    }
    if (listen(listen_fd, 5) < 0) {
        perror("listen failed");
        goto out_close_listen;
    }

    ep = epoll_create(MAX_CONNECTION);
    if (-1 == ep) {
        perror("epoll_create failed.");
        goto out_close_listen;
    }

    /* The record lives on main's stack for the whole lifetime of the registration. */
    self.data = SELF_DEFINE_MAGIC;
    self.fd = listen_fd;

    memset(&ee, 0, sizeof ee);
    ee.events = EPOLLIN | EPOLLOUT | EPOLLET;
    ee.data.ptr = &self;
    if (epoll_ctl(ep, EPOLL_CTL_ADD, listen_fd, &ee) < 0) {
        perror("epoll_ctl failed");
        goto out_close_ep;
    }

    for (;;) {
        int num = epoll_wait(ep, event_list, MAX_EVENTS, -1);
        if (num < 0) {
            if (errno == EINTR) {
                continue;   /* interrupted by a signal: just wait again */
            }
            perror("epoll_wait failed");
            goto out_close_ep;
        }

        for (int i = 0; i < num; i++) {
            const struct self_define_data *s = event_list[i].data.ptr;
            uint32_t revent = event_list[i].events; /* returned events, e.g. EPOLLIN */

            if (s == NULL || s->data != SELF_DEFINE_MAGIC || s->fd != listen_fd) {
                continue;
            }
            if (!(revent & EPOLLIN)) {
                continue;
            }

            /* Edge-triggered: drain the accept queue until it would block,
             * otherwise connections that arrived together are never reported. */
            for (;;) {
                socklen_t client_len = sizeof client;

                client_fd = accept(listen_fd, (struct sockaddr *)&client, &client_len);
                if (client_fd < 0) {
                    if (errno == EINTR) {
                        continue;
                    }
                    if (errno != EAGAIN && errno != EWOULDBLOCK) {
                        perror("accept failed");
                    }
                    break;
                }
                /* do something with the connection here */
                close(client_fd);
            }
        }
    }

out_close_ep:
    if (-1 == close(ep)) {
        perror("epoll_close failed.");
    }
out_close_listen:
    close(listen_fd);
    return rc;
}

Linux加油站

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
epoll的内核实现

ep_ptable_queue_proc函数的主要作用是注册等待函数，并添加到指定的等待队列，第一次调用后此信息已经存在，无需在poll函数中在此调用了。epoll_ctl 接口加入该epoll描述符监听的套接字属于socket filesystem，每一个都对应一个epitem结构体，该结构以红黑树的方式存储，eventpoll中的rbr成员指向该红黑树的root节点，而有监听事件到来的套接字结构以双向连表的形式保存，其头结点对应eventpoll中的rdllist成员。代码的逻辑很简单，注释也很清楚。
复制链接

扫一扫

epoll的内核实现

“相关推荐”对你有帮助么？