epoll源码剖析(二)

最新推荐文章于 2022-11-24 14:31:38 发布

神仙404

最新推荐文章于 2022-11-24 14:31:38 发布

阅读量143

点赞数

分类专栏： Linux源码剖析文章标签： epoll linux

本文链接：https://blog.csdn.net/qq_41345173/article/details/104099624

版权

Linux源码剖析专栏收录该内容

24 篇文章 9 订阅

订阅专栏

上一篇：epoll源码剖析(一)

一、epoll_wait()系统调用解析

[1]sys_epoll_wait源码剖析

/*
 * epoll_wait() system call entry point.
 *
 * Validates maxevents and the user buffer, resolves epfd to its file,
 * checks that the file is an epoll file, then hands the eventpoll object
 * stored in file->private_data to ep_poll().
 *
 * Returns the result of ep_poll(), or -EINVAL (bad maxevents / not an epoll
 * file), -EFAULT (user buffer fails access_ok) or -EBADF (epfd has no file).
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,int, maxevents, int, timeout)
{//linux-4.13.16\fs\eventpoll.c
	int error;
	struct fd f;
	struct eventpoll *ep;
	/* Validate the maxevents argument */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;
	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;
	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;
	/* Make sure the file behind epfd really is an epoll file; is_file_epoll() checks whether f.file->f_op is eventpoll_fops */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;
	// Fetch the all-important eventpoll object
    ep = f.file->private_data;
    /* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);
	/* Both the success path and the "not an epoll file" path drop the fd reference here */
error_fput:
	fdput(f);
	return error;
}

通过上面代码可以看出，sys_epoll_wait主要做两件事，第一是在验证输入参数的合法性之后获取到epoll实例对应的eventpoll对象ep；第二是调用ep_poll来完成对应事件的收集并传递到用户空间。

[2]ep_poll源码剖析

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{//linux-4.13.16\fs\eventpoll.c
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

首先进入ep_poll函数会根据timeout参数确定等待机制，即我们常说的timeout<0一直等待直至有事件发生；timeout=0立即查看是否有事件发生；timeout>0则是设置一定的超时时间。由以上代码可以看到，if (timeout > 0)分支会计算出一个超时截止时间end_time；而else if (timeout == 0)分支则先加锁，然后直接goto check_events去检查是否有事件发生；自然，当timeout<0时两个分支都不会进入，直接执行下面的函数逻辑即可。

fetch_events://只有当timeout>0或timeout<0时才会执行此处分支
    ...
	spin_lock_irqsave(&ep->lock, flags);
    
	if (!ep_events_available(ep)) {
        ...
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

spin_lock_irqsave()宏先对eventpoll ep加锁，然后利用ep_events_available(ep)判断是否有事件发生，若深入剖析下去会发现是通过检查ep->rdllist来检测的。若没有事件发生，会利用__add_wait_queue_exclusive()函数将当前进程挂在eventpoll->wq成员上，那么它是怎样挂在上面的哩？这和wait_queue_entry_t结构体的成员变量有很大关系：wait_queue_entry有一个void *类型的成员变量private，用来保存指向当前进程task_struct的指针；还有一个struct list_head类型的成员变量entry，用来把这个等待项链入eventpoll的等待队列wq，主要就是通过这两个成员变量来实现上述逻辑的。接下来就利用一个for循环不断检测是否有事件发生：

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/*
			 * Always short-circuit for fatal signals to allow
			 * threads to make a timely exit without the chance of
			 * finding more events available and fetching
			 * repeatedly.
			 */
			if (fatal_signal_pending(current)) {
				res = -EINTR;
				break;
			}
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}

在上面的for循环中，先把当前进程的状态设置为TASK_INTERRUPTIBLE(可中断睡眠)，然后若并无事件发生，则先对ep解锁，再调用schedule_hrtimeout_range()使当前进程进入休眠，CPU时间被调度器分配给其他进程使用。当然，当前进程可能会被唤醒，唤醒后的处理对应于对ep解锁之前的3个条件判断，也即：
1.当前进程超时或者某个描述符上有事件发生
2.当前进程收到了一个signal信号
3.当前进程被CPU重新调度，进入for循环重新判断，若未满足上述两个条件则又重新进入休眠
接下来如果进程从休眠中醒来，则将当前进程从eventpoll的等待队列中删除，并且设置当前进程为TASK_RUNNING：

		__remove_wait_queue(&ep->wq, &wait);
		__set_current_state(TASK_RUNNING);
	}

最后如下所示，再次判断有无事件发生之后，若有事件则调用ep_send_events()将事件拷贝到用户空间。

check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

[3]ep_send_events源码剖析

//linux-4.13.16\fs\eventpoll.c
/* Argument bundle handed to the scan callback: the user buffer and its capacity. */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
};
/*
 * Packs maxevents and events into an ep_send_events_data and passes it,
 * together with ep_send_events_proc as the callback, to ep_scan_ready_list().
 * Returns whatever ep_scan_ready_list() returns.
 */
static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events;
    // ep_send_events_proc is passed as a function pointer and used as the callback
	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}

[4]ep_scan_ready_list源码剖析

/*
 * Excerpt of ep_scan_ready_list(); most of the body is elided ("...").
 * The only step shown is the invocation of the caller-supplied callback
 * sproc with the eventpoll, the local txlist and the opaque priv argument.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
    ...
	/* Run the callback on txlist and keep its result in error */
	error = (*sproc)(ep, &txlist, priv);
}

ep_scan_ready_list 函数调用 ep_send_events_proc 对每个已经就绪的事件循环处理。ep_send_events_proc 循环处理就绪事件时，会再次调用每个文件描述符的 poll 方法，以便确定确实有事件发生:

/*
 * Excerpt of ep_send_events_proc(), the callback passed by ep_send_events().
 * The declarations and the enclosing loop header are elided ("..."); only
 * the per-item logic is shown:
 *   - re-poll the item with ep_item_poll();
 *   - if events are reported, copy them and the user data to the user buffer;
 *   - on copy failure, put the item back on head and return the number of
 *     events already delivered, or -EFAULT if none were;
 *   - for EPOLLONESHOT items, mask the event bits down to EP_PRIVATE_BITS;
 *   - for items without EPOLLET, re-queue the item on ep->rdllist.
 * Returns eventcnt, the number of events copied out.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
    ...
    // Re-check that the event is still valid, so stale events are not reported
    revents = ep_item_poll(epi, &pt);
    
    if (revents) {
			/* Copy the event mask and the user data into the user-space slot */
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				list_add(&epi->rdllink, head);
				ep_pm_stay_awake(epi);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
		    if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}
		}
	}

	return eventcnt;
}

接下来在进行简单的事件掩码校验之后，ep_send_events_proc 将事件结构体拷贝到用户空间需要的数据结构中。这是通过 __put_user 方法完成的。

二、Level-triggered VS Edge-triggered

epoll有水平触发和边缘触发两种模式，默认为LT模式。边缘触发(ET)：读接收缓冲区中的数据时，读走一部分数据，缓冲区中的数据就减少一部分，数据减少时不会触发；只有当客户端再次发送数据，接收缓冲区中的数据出现一次"上升"时才会触发epoll。水平触发(LT)：只要缓冲区中有数据，epoll_wait就会不断得到触发。总之，为减少epoll_wait()的无效触发次数，并且一次将缓冲区里边的数据全部读完(为了提高效率)，我们应该采用ET模式，那么在源码级别上它们的区别体现在何方哩？
看一看,sys_epoll_wait先获取epollfd实例对应的eventpoll ep；然后调用ep_poll来完成对应事件的收集并传递到用户空间。ep_poll根据timeout参数确定合理的函数执行逻辑，在之后若有监听事件发生则调用ep_send_events，其再调用ep_scan_ready_list函数将每一个发生事件利用ep_send_events_proc函数拷贝到用户空间，也即最初传入epoll_wait的传出参数epoll_event中。而ET与LT的具体差别就体现在ep_send_events_proc函数中，ep_send_events_proc函数重点关注：

            if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}

那么在epoll默认触发模式也即LT模式下，应该执行上面的else if (!(epi->event.events & EPOLLET)) {逻辑，它会将当前的 epoll_item 对象重新加到 eventpoll 的就绪列表中，这样在下一次 epoll_wait 调用时，这些 epoll_item 对象就会被重新处理。而它最终会不会再次被传递到用户空间，取决于前面ep_send_events_proc函数再次调用监听对象的poll方法(即ep_item_poll(epi, &pt)函数)之后，这个事件是不是依然有效。所以，如果用户空间程序已经处理掉该事件，就不会被再次通知；如果没有处理，意味着该事件依然有效，就会被再次通知(即if(revents))。