嵌入式软件开发之------浅析 linux epoll（十四）

最新推荐文章于 2024-01-02 14:43:40 发布

surquer

最新推荐文章于 2024-01-02 14:43:40 发布

阅读量297

点赞数

分类专栏： linux epoll 文章标签： linux epoll

本文链接：https://blog.csdn.net/lujian186/article/details/113822478

版权

linux 同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

epoll

1 篇文章 0 订阅

订阅专栏

linux代码版本：linux4.4

导读：在监控大量 fd 的时候， select 和 poll 有着明显的缺点：1. 每次调用都要把所有的 fd copy 到内核 2. 活跃 fd 不多的时候轮询方式效率低 3. 无法精确定位产生事件的 fd 。而 epoll 对这几个缺点有明显的改进：1. 开始的时候就将 fd 传递给内核，监控的时候不需要再 copy 到内核 2. 采用 event（回调）的方式 3. 将产生事件的 fd 放入链表，直接查询该链表就行了。

一、epoll

先看一下 epoll 的几个接口函数：

1. epoll_create(int size);

创建 epoll 句柄。size 原本用于提示要监控的 fd 数量，但从下文的代码可以看到，内核只检查它必须大于 0 ，并不真正使用它

2. int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

epoll 事件注冊函数。

epfd是 epoll_create 产生的句柄
op表示动作：用三个宏表示：
EPOLL_CTL_ADD：注冊新的fd到epfd中；
EPOLL_CTL_MOD：改动已经注冊的fd的监听事件；
EPOLL_CTL_DEL：从epfd中删除一个fd；
fd 就是监听的fd

events可以是下面几种的组合：
EPOLLIN ：相应的 fd 能够读
EPOLLOUT：相应的 fd 能够写
EPOLLPRI：相应的 fd 有紧急的数据可读（这里应该表示有带外数据到来）；
EPOLLERR：相应的 fd 发生错误；
EPOLLHUP：相应的 fd 被挂断。
EPOLLET：将EPOLL设为边缘触发(Edge Triggered)模式。这是相对于水平触发(Level Triggered)来说的。
EPOLLONESHOT：仅仅监听一次事件。事件上报一次之后，这个 fd 并不会从 epoll 中删除，而是被禁用（不再上报事件），直到用 EPOLL_CTL_MOD 重新设置监听事件。

3. int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

等待事件的产生，并将产生的事件放到 events 中，避免像 select 和 poll 一样还需要遍历查询哪些 fd 上有事件

二、epoll 函数分析

/*
 * epoll_create(size): legacy entry point. It only validates size and then
 * delegates to sys_epoll_create1() with flags == 0.
 * Returns -EINVAL when size <= 0, otherwise whatever sys_epoll_create1(0)
 * returns.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    /* size must be strictly positive; beyond this check it is never used or passed on. */
	if (size <= 0)
		return -EINVAL;

	return sys_epoll_create1(0);
}

可以看到，size 参数只要大于 0 就行，除此之外没有任何作用，并没有向下传递。接下来看 sys_epoll_create1

/*
 * epoll_create1(flags): allocate a struct eventpoll, reserve an fd, obtain a
 * struct file named "[eventpoll]" that uses eventpoll_fops with the eventpoll
 * as its private data, and install the file at that fd.
 * Returns the new fd on success, or a negative error code; on failure the
 * reserved fd and the eventpoll are released via the labels at the bottom.
 *
 * NOTE: the indented brace block that follows the anon_inode_getfile() call
 * is the article's inline paste of that callee's body, shown for reading
 * only; it is not part of epoll_create1 and is not valid C at that position.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    /* flags is 0 when we get here through epoll_create; only EPOLL_CLOEXEC is accepted */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	/* Allocate the eventpoll structure (ep_alloc's body is not shown in this article) */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	/* Reserve an unused fd; O_RDWR plus the caller's O_CLOEXEC bit are passed as its flags */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
    /* Get a struct file named "[eventpoll]" from the anonymous-inode helper, with eventpoll_fops and ep as private data */
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
           /* ---- inline paste of the anon_inode_getfile() body begins (name/fops/priv/flags are its parameters) ---- */
           {
            	struct qstr this;
            	struct path path;
            	struct file *file;
                /* Bail out if the shared anonymous inode is an error pointer (presumably it is valid once the anon-inode fs is initialized) */
            	if (IS_ERR(anon_inode_inode))
            		return ERR_PTR(-ENODEV);
                /* fops is the file_operations table passed by the caller (eventpoll_fops here); take a reference on its owning module, if it has one */
            	if (fops->owner && !try_module_get(fops->owner))
            		return ERR_PTR(-ENOENT);

            	/*
            	 * Link the inode to a directory entry by creating a unique name
            	 * using the inode sequence number.
            	 */
            	file = ERR_PTR(-ENOMEM);
            	this.name = name;
            	this.len = strlen(name);
            	this.hash = 0;
                /* The inode already exists; allocate a pseudo dentry for it */
            	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
            	if (!path.dentry)
            		goto err_module;

            	path.mnt = mntget(anon_inode_mnt);
            	/*
            	 * We know the anon_inode inode count is always greater than zero,
            	 * so ihold() is safe.
            	 */
            	ihold(anon_inode_inode);
                /* Associate the dentry with the shared anonymous inode */
            	d_instantiate(path.dentry, anon_inode_inode);
                /* Allocate the struct file for this path with the given fops, then fill in its fields */
            	file = alloc_file(&path, OPEN_FMODE(flags), fops);
            	if (IS_ERR(file))
            		goto err_dput;
            	file->f_mapping = anon_inode_inode->i_mapping;

            	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
            	file->private_data = priv;

            	return file;

            err_dput:
            	path_put(&path);
            err_module:
            	module_put(fops->owner);
            	return file;
            }
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
    /* Record the file in the eventpoll, then bind fd to file */
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

这段代码就非常有意思，

1. 先获取一个进程未使用的 fd ，作为 epoll_create 返回值

2. 在匿名文件系统中创建一个 eventpoll 的文件（创建 dentry 并链接 inode），再分配 file 结构体并填充相关内容，fops 用的是 eventpoll_fops

3. fd 和 file 关联

4. eventpoll 是一个非常重要的结构体，作为私有结构赋值给 file

整个过程就类似于 open 函数创建一个 eventpoll 文件并打开，返回 fd ，可以简单认为，epoll_create 在匿名文件系统中创建了一个 eventpoll 文件并打开。

下面看 epoll_ctl ：

/*
 * epoll_ctl(epfd, op, fd, event): add, modify or delete the watch for fd
 * inside the eventpoll behind epfd.
 *
 * Error codes visible in this function: -EFAULT (copy_from_user failed),
 * -EBADF (epfd or fd has no file), -EPERM (target has no ->poll),
 * -EINVAL (epfd == fd, epfd is not an epoll file, or unknown op),
 * -ELOOP (loop check failed), -EEXIST (ADD of an existing item),
 * -ENOENT (DEL/MOD of a missing item); otherwise the result of
 * ep_insert()/ep_remove()/ep_modify().
 *
 * NOTE: the indented brace blocks that follow the calls to ep_insert(),
 * init_poll_funcptr() and ep_item_poll() are the article's inline pastes of
 * the bodies of ep_insert, ep_ptable_queue_proc and ep_item_poll, shown for
 * reading only; they are not part of epoll_ctl and are not valid C at those
 * positions.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
    /* If the op carries an event (per the article: everything except EPOLL_CTL_DEL), copy it in from user space first */
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
    /* Look up the file behind epfd */
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
    /* Then look up the file behind the fd being operated on */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
    /* The target's f_op->poll must not be NULL, i.e. its driver must implement poll */
	if (!tf.file->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
        /* Per the article this clears the EPOLLWAKEUP flag; the helper's body is not shown here */
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
    /* As the comment above says: epfd cannot be added to itself, and epfd must really be an epoll file */
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	/* Fetch the eventpoll from private_data (drivers commonly keep their private state in file->private_data as well) */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex toplogies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
    /* Adding an fd */
	if (op == EPOLL_CTL_ADD) {

        /* A full check is needed when epfd's own f_ep_links list is non-empty or when the target is an epoll file */
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			full_check = 1;
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
            /* Target is itself an epoll file: fail with -ELOOP if ep_loop_check() returns non-zero */
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					goto error_tgt_fput;
				}
			} else
			/* Otherwise put the target file on tfile_check_list */
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	/* Find the epitem in this eventpoll that matches the (file, fd) pair */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
        /* ADD requires that no epitem exists yet; a duplicate add fails with -EEXIST */
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
            /* ---- inline paste of the ep_insert() body begins (ep/event/tfile/fd/full_check are its parameters) ---- */
            {
            	int error, revents, pwake = 0;
            	unsigned long flags;
            	long user_watches;
            	struct epitem *epi;
            	struct ep_pqueue epq;
                /* Read the per-user watch count and refuse with -ENOSPC once it reaches max_user_watches */
            	user_watches = atomic_long_read(&ep->user->epoll_watches);
            	if (unlikely(user_watches >= max_user_watches))
            		return -ENOSPC;
                /* Allocate the epitem for this watch */
            	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
            		return -ENOMEM;

            	/* Item initialization follow here ... */
                /* Link used for the ready list */
            	INIT_LIST_HEAD(&epi->rdllink);
                /* Link used for the target file's list of epitems */
            	INIT_LIST_HEAD(&epi->fllink);
                /* Head of this item's list of poll wait entries */
            	INIT_LIST_HEAD(&epi->pwqlist);
            	epi->ep = ep;
            	ep_set_ffd(&epi->ffd, tfile, fd);
            	epi->event = *event;
            	epi->nwait = 0;
            	epi->next = EP_UNACTIVE_PTR;
            	if (epi->event.events & EPOLLWAKEUP) {
                    /* Create a wakeup source for this item */
            		error = ep_create_wakeup_source(epi);
            		if (error)
            			goto error_create_wakeup_source;
            	} else {
            		RCU_INIT_POINTER(epi->ws, NULL);
            	}

            	/* Initialize the poll table using the queue callback */
            	epq.epi = epi;
                /* Register ep_ptable_queue_proc as the poll-table callback; per the article the driver's poll reaches it through poll_wait() */
            	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
                                           /* ---- inline paste of the ep_ptable_queue_proc() body (whead/pt are its parameters) ---- */
                                           {
                                                /* Allocate an eppoll_entry, queue it on the wait queue head and link it into epi->pwqlist */
                                            	struct epitem *epi = ep_item_from_epqueue(pt);
                                            	struct eppoll_entry *pwq;

                                            	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
                                            		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
                                            		pwq->whead = whead;
                                            		pwq->base = epi;
                                            		add_wait_queue(whead, &pwq->wait);
                                            		list_add_tail(&pwq->llink, &epi->pwqlist);
                                            		epi->nwait++;
                                            	} else {
                                            		/* We have to signal that an error occurred */
                                            		epi->nwait = -1;
                                            	}
                                            }

            	/*
            	 * Attach the item to the poll hooks and get current event bits.
            	 * We can safely use the file* here because its usage count has
            	 * been increased by the caller of this function. Note that after
            	 * this operation completes, the poll callback can start hitting
            	 * the new item.
            	 */
            	/* Call the target file's ->poll() and keep the returned event bits */
            	revents = ep_item_poll(epi, &epq.pt);
                          /* ---- inline paste of the ep_item_poll() body ---- */
                          {
                            	pt->_key = epi->event.events;

                            	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
                           }

            	/*
            	 * We have to check if something went wrong during the poll wait queue
            	 * install process. Namely an allocation for a wait queue failed due
            	 * high memory pressure.
            	 */
            	error = -ENOMEM;
            	if (epi->nwait < 0)
            		goto error_unregister;

            	/* Add the current item to the list of active epoll hook for this file */
            	spin_lock(&tfile->f_lock);
            	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
            	spin_unlock(&tfile->f_lock);

            	/*
            	 * Add the current item to the RB tree. All RB tree operations are
            	 * protected by "mtx", and ep_insert() is called with "mtx" held.
            	 */
            	ep_rbtree_insert(ep, epi);

            	/* now check if we've created too many backpaths */
            	error = -EINVAL;
            	if (full_check && reverse_path_check())
            		goto error_remove_epi;

            	/* We have to drop the new item inside our item list to keep track of it */
            	spin_lock_irqsave(&ep->lock, flags);

            	/* If the file is already "ready" we drop it inside the ready list */
                /* If requested events are already pending, put the epitem on the eventpoll's ready list */
            	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
            		list_add_tail(&epi->rdllink, &ep->rdllist);
            		ep_pm_stay_awake(epi);

            		/* Notify waiting tasks that events are available */
            		if (waitqueue_active(&ep->wq))
            			wake_up_locked(&ep->wq);
            		if (waitqueue_active(&ep->poll_wait))
            			pwake++;
            	}

            	spin_unlock_irqrestore(&ep->lock, flags);

            	atomic_long_inc(&ep->user->epoll_watches);

            	/* We have to call this outside the lock */
            	if (pwake)
            		ep_poll_safewake(&ep->poll_wait);

            	return 0;

            error_remove_epi:
            	spin_lock(&tfile->f_lock);
            	list_del_rcu(&epi->fllink);
            	spin_unlock(&tfile->f_lock);

            	rb_erase(&epi->rbn, &ep->rbr);

            error_unregister:
            	ep_unregister_pollwait(ep, epi);

            	/*
            	 * We need to do this because an event could have been arrived on some
            	 * allocated wait queue. Note that we don't care about the ep->ovflist
            	 * list, since that is used/cleaned only inside a section bound by "mtx".
            	 * And ep_insert() is called with "mtx" held.
            	 */
            	spin_lock_irqsave(&ep->lock, flags);
            	if (ep_is_linked(&epi->rdllink))
            		list_del_init(&epi->rdllink);
            	spin_unlock_irqrestore(&ep->lock, flags);

            	wakeup_source_unregister(ep_wakeup_source(epi));

            error_create_wakeup_source:
            	kmem_cache_free(epi_cache, epi);

            	return error;
            }
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

简单总结如下：

1. 根据 epfd、fd 获取到相应的 file

2. 将 epoll_event copy 到内核

3. 要保证传递的 fd 不能是 epfd 自身（epfd 必须是 epoll 文件；其他 epoll 的 fd 可以被添加，但必须通过 ep_loop_check 的环路检查），且其 file->f_op->poll 不能为 NULL

4. 查询 fd 对应的 epi（每添加一个 fd ，为其分配一个 epi 结构体；ep_pqueue 只是 ep_insert 中的局部变量）

5. EPOLL_CTL_ADD 为添加的 fd 创建对应的 epi 及 eppoll_entry，调用驱动 poll 时，将 eppoll_entry 添加到文件的等待队列头，其 eppoll_entry 回调函数为 ep_poll_callback ，

6. 接下来调用驱动对应的 poll 函数，得到返回值，如果有事件，将 epi->rdllink （可通过 epi 获得 fd 及其 event ）插入到 ep->rdllist ，同时唤醒在 ep->wq 上休眠的进程（即调用 epoll_wait 的进程）以及在 ep->poll_wait 上休眠的进程（即对 epfd 本身调用 file->poll 的进程）

下面看一个非常关键的函数 ep_poll_callback ，当监控文件有事件产生时，会被 wake up，并调用等待队列的回调函数 ep_poll_callback

/*
 * Wait-queue callback installed by ep_ptable_queue_proc() on every wait
 * queue entry created for a watched file. It runs when that wait queue is
 * woken and, under ep->lock, either chains the epitem on ep->ovflist (while
 * ovflist is active) or links it into ep->rdllist and wakes the waiters on
 * ep->wq / ep->poll_wait.
 *
 * @wait: the wait queue entry embedded in the item's eppoll_entry
 * @key:  event bits reported by the waker, or NULL if it reports none
 * @mode, @sync: not used in this function body
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);   // recover the epitem from the wait queue entry
	struct eventpoll *ep = epi->ep;   // the eventpoll that owns this epitem

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	/* ovflist is active: chain the epitem onto ep->ovflist (only if not already chained) and leave */
	if (ep->ovflist != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
			if (epi->ws) {
				/*
				 * Activate ep->ws since epi->ws may get
				 * deactivated at any time.
				 */
				__pm_stay_awake(ep->ws);
			}

		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
   /* Otherwise link the epitem into the eventpoll's ready list */
	if (!ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	 /* Wake the eventpoll's wait queues; the poll_wait wakeup is deferred until the lock is dropped */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

    /* POLLFREE is set in key: unlink our wait entry and clear the eppoll_entry's whead */
	if ((unsigned long)key & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->task_list);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return 1;
}

从代码中可知，当监控的 fd 有事件发生，就会唤醒等待队列，进而调用 ep_poll_callback ，而该函数总结下来主要做了一件事：将对应的 epi 插入到 eventpoll 的就绪链表中（如果此时正在向用户态传送事件，则先挂到 ep->ovflist 上），并唤醒等待的进程。

猜测 epoll_wait 的主要任务就是监控 eventpoll 的就绪链表，然后将相应的 epi 转发为 event 输出给用户程序。

下面看 epoll_wait ：

/*
 * epoll_wait(epfd, events, maxevents, timeout): validate the arguments,
 * resolve epfd to its eventpoll and call ep_poll() to collect events.
 *
 * Returns -EINVAL if maxevents is out of (0, EP_MAX_EVENTS] or epfd is not an
 * epoll file, -EFAULT if the user buffer fails access_ok(), -EBADF if epfd
 * has no file; otherwise the value returned by ep_poll().
 *
 * NOTE: the indented brace block that follows the ep_poll() call is the
 * article's inline paste of ep_poll's body, shown for reading only; it is not
 * part of epoll_wait and is not valid C at that position.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
    /* Reject a maxevents value outside the range 1..EP_MAX_EVENTS */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
    /* The user buffer must be writable for maxevents entries */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
    /* Look up the file behind epfd */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
    /* epfd must refer to an epoll file */
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	 /* Fetch the eventpoll from the file's private_data */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);
            /* ---- inline paste of the ep_poll() body begins (ep/events/maxevents/timeout are its parameters) ---- */
            {
            	int res = 0, eavail, timed_out = 0;
            	unsigned long flags;
            	u64 slack = 0;
            	wait_queue_t wait;
            	ktime_t expires, *to = NULL;

            	if (timeout > 0) {
            		struct timespec end_time = ep_set_mstimeout(timeout);

            		slack = select_estimate_accuracy(&end_time);
            		to = &expires;
            		*to = timespec_to_ktime(end_time);
            	} else if (timeout == 0) {
            		/*
            		 * Avoid the unnecessary trip to the wait queue loop, if the
            		 * caller specified a non blocking operation.
            		 */
            		/* Non-blocking call: skip the wait queue, check once and return */
            		timed_out = 1;
            		spin_lock_irqsave(&ep->lock, flags);
            		goto check_events;
            	}

            fetch_events:
            	spin_lock_irqsave(&ep->lock, flags);

            	if (!ep_events_available(ep)) {
            		/*
            		 * We don't have any available event to return to the caller.
            		 * We need to sleep here, and we will be wake up by
            		 * ep_poll_callback() when events will become available.
            		 */
            		 /* Queue the current task on the wait queue ep->wq */
            		init_waitqueue_entry(&wait, current);
            		__add_wait_queue_exclusive(&ep->wq, &wait);

                    /* Loop until events are available, the timeout expires or a signal is pending; ep->wq is woken by ep_poll_callback and by the insert path of epoll_ctl */
            		for (;;) {
            			/*
            			 * We don't want to sleep if the ep_poll_callback() sends us
            			 * a wakeup in between. That's why we set the task state
            			 * to TASK_INTERRUPTIBLE before doing the checks.
            			 */
            			set_current_state(TASK_INTERRUPTIBLE);
            			if (ep_events_available(ep) || timed_out)
            				break;
            			if (signal_pending(current)) {
            				res = -EINTR;
            				break;
            			}

            			spin_unlock_irqrestore(&ep->lock, flags);
            			if (!freezable_schedule_hrtimeout_range(to, slack,
            								HRTIMER_MODE_ABS))
            				timed_out = 1;

            			spin_lock_irqsave(&ep->lock, flags);
            		}

            		__remove_wait_queue(&ep->wq, &wait);
            		__set_current_state(TASK_RUNNING);
            	}
            check_events:
            	/* Is it worth to try to dig for events ? */
            	eavail = ep_events_available(ep);

            	spin_unlock_irqrestore(&ep->lock, flags);

            	/*
            	 * Try to transfer events to user space. In case we get 0 events and
            	 * there's still timeout left over, we go trying again in search of
            	 * more luck.
            	 */
            	 /* If events are available, ep_send_events() delivers them to the user's events buffer; if it returns 0 and we have not timed out, wait again */
            	if (!res && eavail &&
            	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
            		goto fetch_events;

            	return res;
            }

error_fput:
	fdput(f);
	return error;
}

从上面的代码可知，除了一些参数合法性判断，其实就是检查 eventpoll 就绪链表（ep->rdllist）是否有epi ，然后将 epi 转换成 events 再输出给用户态。

总结：做过底层开发的人都知道，通常对于硬件事件有 查询法 和中断两种方式，select和poll 就类似查询法，而 epoll 则类似中断法。而实际底层开发过程中，一般也是用中断的方法处理硬件事件，

避免了 CPU 一次次的白转。

surquer

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
嵌入式软件开发之------浅析 linux epoll（十四）

linux代码版本：linux4.4导读：在监控大量 fd 的时候，select 和 poll 有着明显的缺点：1. copy 所有的 fd 到内核 2. 活跃数不多的时候轮询方式效率低 3. 无法精确产生事件的 fd 。而 epoll 对这几个缺点有明显的改进：1. 开始的时候就将 fd 传递给内核，监控的时候不需要再 copy 到内核 2. 采用 event 的方式 3. 将产生事件的 fd 放入链表，直接查询该链表就行了。一、epoll先看一下 epoll 的几个接口函数：1. ...
复制链接

扫一扫

专栏目录