嵌入式软件开发之------浅析 linux epoll(十四)

linux代码版本:linux4.4

导读:在监控大量 fd 的时候, select 和 poll 有着明显的缺点:1. copy 所有的 fd 到内核 2. 活跃数不多的时候轮询方式效率低 3. 无法精确产生事件的 fd 。而 epoll 对这几个缺点有明显的改进:1. 开始的时候就将 fd 传递给内核,监控的时候不需要再 copy 到内核  2. 采用 event 的方式 3. 将产生事件的 fd 放入链表,直接查询该链表就行了。

一、epoll 

先看一下 epoll 的几个接口函数:

1. epoll_create(int size);

创建 epoll 句柄,size 为监控 fd 数量的最大值(自 Linux 2.6.8 起内核忽略该参数,只要求其大于 0)

2. int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

epoll 事件注册函数。

epfd是 epoll_create 产生的句柄
op表示动作:用三个宏表示:
EPOLL_CTL_ADD:注册新的fd到epfd中;
EPOLL_CTL_MOD:修改已经注册的fd的监听事件;
EPOLL_CTL_DEL:从epfd中删除一个fd;
fd 就是监听的fd 

events可以是下面几种的组合:
EPOLLIN :相应的 fd 能够读
EPOLLOUT:相应的 fd 能够写
EPOLLPRI:相应的 fd 有紧急的数据可读(这里应该表示有带外数据到来);
EPOLLERR:相应的 fd 发生错误;
EPOLLHUP:相应的 fd 发生错误挂断。
EPOLLET: 将EPOLL设为边缘触发(Edge Triggered)模式。这是相对于水平触发(Level Triggered)来说的。
EPOLLONESHOT:仅仅监听一次事件。当监听完这次事件之后,就会把这个fd从epoll的队列中删除。

3. int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

等待事件的产生,并将产生的事件放到 events ,避免像 select 和 poll 一样还需要遍历查询都哪些 fd 上有事件

 二、epoll 函数分析

SYSCALL_DEFINE1(epoll_create, int, size)
{
    /* "size" is only checked for being positive; it is otherwise
     * ignored and not passed down (kept for ABI compatibility) */
	if (size <= 0)
		return -EINVAL;

	return sys_epoll_create1(0);
}

可以看到,size 参数只要是正值就能通过检查,但并没有向下传递,实际没有什么用,接下来看 sys_epoll_create1 

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    /* flags is 0 when we arrive here via epoll_create() */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	/* Allocate the eventpoll structure and initialize it */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	/* Grab an unused fd from the current task's file table, opened read/write */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
    /* Create an anonymous file (inode) on the anon filesystem and return
     * its struct file (the body of anon_inode_getfile is inlined below
     * by the article for reference) */
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
           {
            	struct qstr this;
            	struct path path;
            	struct file *file;
                /* anon_inode_inode is valid once the anon filesystem is initialized */
            	if (IS_ERR(anon_inode_inode))
            		return ERR_PTR(-ENODEV);
                /* fops should look familiar: filling in its members is what a
                 * driver does; here eventpoll_fops is used directly */
            	if (fops->owner && !try_module_get(fops->owner))
            		return ERR_PTR(-ENOENT);

            	/*
            	 * Link the inode to a directory entry by creating a unique name
            	 * using the inode sequence number.
            	 */
            	file = ERR_PTR(-ENOMEM);
            	this.name = name;
            	this.len = strlen(name);
            	this.hash = 0;
                /* We have the inode; now allocate a dentry for it */
            	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
            	if (!path.dentry)
            		goto err_module;

            	path.mnt = mntget(anon_inode_mnt);
            	/*
            	 * We know the anon_inode inode count is always greater than zero,
            	 * so ihold() is safe.
            	 */
            	ihold(anon_inode_inode);
                /* Bind the dentry to the inode */
            	d_instantiate(path.dentry, anon_inode_inode);
                /* Allocate and fill the struct file */
            	file = alloc_file(&path, OPEN_FMODE(flags), fops);
            	if (IS_ERR(file))
            		goto err_dput;
            	file->f_mapping = anon_inode_inode->i_mapping;

            	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
            	file->private_data = priv;

            	return file;

            err_dput:
            	path_put(&path);
            err_module:
            	module_put(fops->owner);
            	return file;
            }
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
    /* Attach the file to the eventpoll */
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

这段代码就非常有意思,

1. 先获取一个进程未使用的 fd ,作为 epoll_create 返回值

2. 在匿名文件系统中创建一个 eventpoll 的文件(创建 dentry 并链接 inode),再分配 file 结构体并填充相关内容,fops 用的是 eventpoll_fops

3. fd 和 file 关联

4. eventpoll 是一个非常重要的结构体,作为 私有结构赋值给 file

整个过程就类似于 open 函数创建一个 eventpoll 文件并打开,返回 fd ,可以简单认为,epoll_create 在匿名文件系统中创建了一个 eventpoll 文件并打开。

下面看  epoll_ctl :

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
    /* For every op except EPOLL_CTL_DEL, copy the event from user space first */
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
    /* Resolve epfd to its struct file */
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
    /* Then resolve the target fd to its struct file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
    /* The monitored file's f_op->poll must not be NULL, i.e. the driver
     * must have hooked up a poll method */
	if (!tf.file->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
        /* Clears the EPOLLWAKEUP flag when the caller may not use it */
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
    /* As the comment above says: epfd itself must not be added, and
     * epfd must really be an epoll file */
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	/* Retrieve the eventpoll attached at epoll_create time; driver code
	 * similarly stashes private structures in file->private_data */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex toplogies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
    /* Adding an fd */
	if (op == EPOLL_CTL_ADD) {

        /* epfd is itself being watched (f_ep_links non-empty), or the
         * target fd is an epoll fd: the full loop/path check is needed */
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			full_check = 1;
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
            /* When the added fd is an epoll fd, reject wakeup loops */
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					goto error_tgt_fput;
				}
			} else
			/* Insert the added fd's file into tfile_check_list */ 
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	/* Look up the epitem matching the passed-in fd in the eventpoll RB tree */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
        /* ADD implies no epitem exists yet; adding twice is an error (-EEXIST) */
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
            {
            	int error, revents, pwake = 0;
            	unsigned long flags;
            	long user_watches;
            	struct epitem *epi;
            	struct ep_pqueue epq;
                /* Check how many fds this user is already watching */
            	user_watches = atomic_long_read(&ep->user->epoll_watches);
            	if (unlikely(user_watches >= max_user_watches))
            		return -ENOSPC;
                /* Allocate the eventpoll item */
            	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
            		return -ENOMEM;

            	/* Item initialization follow here ... */
                /* ready-list link */
            	INIT_LIST_HEAD(&epi->rdllink);
                /* per-file link */
            	INIT_LIST_HEAD(&epi->fllink);
                /* poll wait-queue list */
            	INIT_LIST_HEAD(&epi->pwqlist);
            	epi->ep = ep;
            	ep_set_ffd(&epi->ffd, tfile, fd);
            	epi->event = *event;
            	epi->nwait = 0;
            	epi->next = EP_UNACTIVE_PTR;
            	if (epi->event.events & EPOLLWAKEUP) {
                    /* Create a wakeup source */
            		error = ep_create_wakeup_source(epi);
            		if (error)
            			goto error_create_wakeup_source;
            	} else {
            		RCU_INIT_POINTER(epi->ws, NULL);
            	}

            	/* Initialize the poll table using the queue callback */
            	epq.epi = epi;
                /* When the driver's poll() calls poll_wait(), it invokes
                 * ep_ptable_queue_proc (body inlined below by the article) */
            	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
                                           {
                                                /* Allocate an eppoll_entry, hook it onto the file's
                                                 * wait queue head and onto epi->pwqlist */
                                            	struct epitem *epi = ep_item_from_epqueue(pt);
                                            	struct eppoll_entry *pwq;

                                            	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
                                            		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
                                            		pwq->whead = whead;
                                            		pwq->base = epi;
                                            		add_wait_queue(whead, &pwq->wait);
                                            		list_add_tail(&pwq->llink, &epi->pwqlist);
                                            		epi->nwait++;
                                            	} else {
                                            		/* We have to signal that an error occurred */
                                            		epi->nwait = -1;
                                            	}
                                            }

            	/*
            	 * Attach the item to the poll hooks and get current event bits.
            	 * We can safely use the file* here because its usage count has
            	 * been increased by the caller of this function. Note that after
            	 * this operation completes, the poll callback can start hitting
            	 * the new item.
            	 */
            	/* Call the driver's poll() implementation and get its mask */
            	revents = ep_item_poll(epi, &epq.pt);
                          {
                            	pt->_key = epi->event.events;

                            	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
                           }

            	/*
            	 * We have to check if something went wrong during the poll wait queue
            	 * install process. Namely an allocation for a wait queue failed due
            	 * high memory pressure.
            	 */
            	error = -ENOMEM;
            	if (epi->nwait < 0)
            		goto error_unregister;

            	/* Add the current item to the list of active epoll hook for this file */
            	spin_lock(&tfile->f_lock);
            	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
            	spin_unlock(&tfile->f_lock);

            	/*
            	 * Add the current item to the RB tree. All RB tree operations are
            	 * protected by "mtx", and ep_insert() is called with "mtx" held.
            	 */
            	ep_rbtree_insert(ep, epi);

            	/* now check if we've created too many backpaths */
            	error = -EINVAL;
            	if (full_check && reverse_path_check())
            		goto error_remove_epi;

            	/* We have to drop the new item inside our item list to keep track of it */
            	spin_lock_irqsave(&ep->lock, flags);

            	/* If the file is already "ready" we drop it inside the ready list */
                /* Events already pending: queue the epi on eventpoll's ready list */
            	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
            		list_add_tail(&epi->rdllink, &ep->rdllist);
            		ep_pm_stay_awake(epi);

            		/* Notify waiting tasks that events are available */
            		if (waitqueue_active(&ep->wq))
            			wake_up_locked(&ep->wq);
            		if (waitqueue_active(&ep->poll_wait))
            			pwake++;
            	}

            	spin_unlock_irqrestore(&ep->lock, flags);

            	atomic_long_inc(&ep->user->epoll_watches);

            	/* We have to call this outside the lock */
            	if (pwake)
            		ep_poll_safewake(&ep->poll_wait);

            	return 0;

            error_remove_epi:
            	spin_lock(&tfile->f_lock);
            	list_del_rcu(&epi->fllink);
            	spin_unlock(&tfile->f_lock);

            	rb_erase(&epi->rbn, &ep->rbr);

            error_unregister:
            	ep_unregister_pollwait(ep, epi);

            	/*
            	 * We need to do this because an event could have been arrived on some
            	 * allocated wait queue. Note that we don't care about the ep->ovflist
            	 * list, since that is used/cleaned only inside a section bound by "mtx".
            	 * And ep_insert() is called with "mtx" held.
            	 */
            	spin_lock_irqsave(&ep->lock, flags);
            	if (ep_is_linked(&epi->rdllink))
            		list_del_init(&epi->rdllink);
            	spin_unlock_irqrestore(&ep->lock, flags);

            	wakeup_source_unregister(ep_wakeup_source(epi));

            error_create_wakeup_source:
            	kmem_cache_free(epi_cache, epi);

            	return error;
            }
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

简单总结一下:

1.  根据 epfd、fd 获取到 相应的 file 

2. 将 epoll_event copy 到内核

3.  要保证传递的 fd 不能是 epoll 的 fd ,且其 file->f_op->poll 不能为 NULL

4.  查询 fd 对应的 epi(每添加一个 fd ,为其分配一个 epi 结构体及 ep_pqueue ) ,

5. EPOLL_CTL_ADD 为 添加的 fd 创建对应的 epi 及 eppoll_entry,调用驱动 poll 时,将 eppoll_entry 添加到文件的等待队列头,其  eppoll_entry 回调函数为 ep_poll_callback ,

6. 接下来调用 驱动对应的 poll 函数,得到返回值,如果有事件,将 epi->rdllink (可通过 epi 获得 fd 及其 event )插入到 ep->rdllist ,同时wakeup 因sys_epoll_wait(ep->wq)和 by file->poll (ep->poll_wait) 而休眠的进程(其实就是调用 epoll_wait 的进程)

下面看一个非常关键的函数 ep_poll_callback ,当监控文件有 事件产生时,会被 wake up,并调用等待队列的回调函数 ep_poll_callback 

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);   // get the epitem from the wait queue entry
	struct eventpoll *ep = epi->ep;   // get the owning eventpoll

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	/* Chain the epi onto ep->ovflist instead of the ready list */
	if (ep->ovflist != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
			if (epi->ws) {
				/*
				 * Activate ep->ws since epi->ws may get
				 * deactivated at any time.
				 */
				__pm_stay_awake(ep->ws);
			}

		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
   /* Insert the epi into ep's ready list */
	if (!ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	 /* Wake up waiters on ep's wait queue */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

    /* POLLFREE: the wait queue head is being torn down; detach ourselves */
	if ((unsigned long)key & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->task_list);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return 1;
}

从代码中可知,当监控的 fd 有事件发生,就会 wakeup 等待队列,然后调用 ep_poll_callback ,而该函数总结下来就做了一件事,就是将对应的 epi 插入到 eventpoll 的就绪链表中。

猜测  epoll_wait 的主要任务就是监控 eventpoll 的就绪链表,然后将相应的 epi 转发为 event 输出给用户程序。

下面看 epoll_wait :

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
    /* Validate the maximum number of events to monitor */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
    /* The user-space events buffer must be writable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
    /* Fetch the struct file for epfd */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
    /* epfd must really be an epoll fd */
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	 /* Retrieve the eventpoll allocated at epoll_create time */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);
            {
            	int res = 0, eavail, timed_out = 0;
            	unsigned long flags;
            	u64 slack = 0;
            	wait_queue_t wait;
            	ktime_t expires, *to = NULL;

            	if (timeout > 0) {
            		struct timespec end_time = ep_set_mstimeout(timeout);

            		slack = select_estimate_accuracy(&end_time);
            		to = &expires;
            		*to = timespec_to_ktime(end_time);
            	} else if (timeout == 0) {
            		/*
            		 * Avoid the unnecessary trip to the wait queue loop, if the
            		 * caller specified a non blocking operation.
            		 */
            		/* Non-blocking access: no need to join the wait queue,
            		 * just check once and return */
            		timed_out = 1;
            		spin_lock_irqsave(&ep->lock, flags);
            		goto check_events;
            	}

            fetch_events:
            	spin_lock_irqsave(&ep->lock, flags);

            	if (!ep_events_available(ep)) {
            		/*
            		 * We don't have any available event to return to the caller.
            		 * We need to sleep here, and we will be wake up by
            		 * ep_poll_callback() when events will become available.
            		 */
            		 /* Add the current task to the ep->wq wait queue */
            		init_waitqueue_entry(&wait, current);
            		__add_wait_queue_exclusive(&ep->wq, &wait);

                    /* Loop until timeout or until woken by an event (both
                     * epoll_ctl and ep_poll_callback can wake us) */
            		for (;;) {
            			/*
            			 * We don't want to sleep if the ep_poll_callback() sends us
            			 * a wakeup in between. That's why we set the task state
            			 * to TASK_INTERRUPTIBLE before doing the checks.
            			 */
            			set_current_state(TASK_INTERRUPTIBLE);
            			if (ep_events_available(ep) || timed_out)
            				break;
            			if (signal_pending(current)) {
            				res = -EINTR;
            				break;
            			}

            			spin_unlock_irqrestore(&ep->lock, flags);
            			if (!freezable_schedule_hrtimeout_range(to, slack,
            								HRTIMER_MODE_ABS))
            				timed_out = 1;

            			spin_lock_irqsave(&ep->lock, flags);
            		}

            		__remove_wait_queue(&ep->wq, &wait);
            		__set_current_state(TASK_RUNNING);
            	}
            check_events:
            	/* Is it worth to try to dig for events ? */
            	eavail = ep_events_available(ep);

            	spin_unlock_irqrestore(&ep->lock, flags);

            	/*
            	 * Try to transfer events to user space. In case we get 0 events and
            	 * there's still timeout left over, we go trying again in search of
            	 * more luck.
            	 */
            	 /* Events are pending (the eventpoll ready list is not empty):
            	  * convert the epis and copy them out to the user events array */
            	if (!res && eavail &&
            	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
            		goto fetch_events;

            	return res;
            }

error_fput:
	fdput(f);
	return error;
}

从上面的代码可知,除了一些参数合法性判断,其实就是检查 eventpoll 就绪链表(ep->rdllist) 是否有epi ,然后将 epi 转换成 events 再输出给用户态。

总结:做过底层开发的人都知道,通常对于硬件事件有 查询法 中断 两种方式,select和poll 就类似 查询法 ,而 epoll 则类似中断法。而实际底层开发过程中,一般也是用中断的方法处理硬件事件,

避免了 CPU 一次次的白转。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值