非阻塞io之 epoll内核实现

最新推荐文章于 2024-05-13 20:03:46 发布

山羊哥-老宋

最新推荐文章于 2024-05-13 20:03:46 发布

阅读量489

点赞数

分类专栏： linux-kernel-4.19 文章标签： linux epoll 操作系统 kernel 事件驱动

本文链接：https://blog.csdn.net/qq_22418329/article/details/109204280

版权

linux-kernel-4.19 专栏收录该内容

13 篇文章 3 订阅

订阅专栏

epoll

创建epollFd
- epollFD文件描述符的回调实现
epoll_ctl 系统调用
epoll_wait系统调用

可以结合这篇文章一起看 eventfd
由于需要实现一个驱动，想把设备封装成可以被epoll监听的形式，但在网上能找到的关于epoll内核实现的资料很少，所以自己阅读了内核代码来了解一下epoll机制，在这里做个总结，方便以后查看。

/*
 * This structure is stored in the "private_data" member of `struct file`
 * and is the main data structure of the eventpoll interface.
 * Access to it is protected by the lock inside wq.
 */
struct eventpoll {
	// fs/eventpoll.c
	/*
	 * This mutex is used to ensure that files are not removed while
	 * epoll is using them. It is held during the event collection loop,
	 * the file cleanup path, the epoll file exit code and the ctl
	 * operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	/*
	 * A thread calling epoll_wait queues its wait entry here before it
	 * blocks (see ep_poll()), so that it can be woken up when a
	 * monitored event occurs.
	 */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	/* Monitored files (e.g. sockets) are linked here once they have pending events */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	/* Every monitored file is kept in this red-black tree */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->wq.lock.
	 */
	struct epitem *ovflist;
 
	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	/* The struct file backing this eventpoll instance (set in do_epoll_create()) */
	struct file *file;

	/* Used to optimize loop detection check */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Used to track the busy poll napi_id */
	unsigned int napi_id;
#endif
};

创建epollFd

创建epoll实例有两个系统调用：epoll_create1 和 epoll_create，它们最终都会调用 do_epoll_create，其步骤如下：

1. 调用ep_alloc方法创建一个eventpoll实例。
2. 调用get_unused_fd_flags方法找到一个未使用的fd，这个就是最终返回给我们的文件描述符。
3. 调用anon_inode_getfile方法创建一个file实例，其类型为 struct file：

// include/linux/fs.h
// Abridged excerpt: only the two members relevant to epoll are shown.
struct file {
  //...
  // Table of function pointers implementing the operations on this file
  // (read/write and so on), so each kind of file can supply its own
  // implementation.
  const struct file_operations  *f_op;
  //...
  // The other members of struct file hold data common to every file type,
  // while this member carries data specific to the concrete file type.
  void      *private_data;
  //...
};

调用anon_inode_getfile方法传入的参数中，eventpoll_fops最终被赋值到上面的f_op字段，ep被赋值到上面的private_data字段。
4. 调用fd_install方法在内核中建立 fd 与 file 的对应关系，这样以后就可以通过fd来找到对应的file。
5. 返回fd给用户。
至此，epoll_create1方法结束。

/*
 * Open an eventpoll file descriptor.
 *
 * Allocates the "struct eventpoll", reserves a free descriptor, wraps the
 * eventpoll in an anonymous-inode file and publishes the descriptor.
 * Returns the new descriptor, or a negative error code.
 */
static int do_epoll_create(int flags)
{
	struct eventpoll *ep = NULL;
	struct file *epfile;
	int fd, ret;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	/* EPOLL_CLOEXEC is the only flag accepted. */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/* Create the internal data structure ("struct eventpoll"). */
	ret = ep_alloc(&ep);
	if (ret < 0)
		return ret;

	/*
	 * Create everything needed to set up an eventpoll file: first a free
	 * file descriptor, then the file structure itself.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		ep_free(ep);
		return fd;
	}

	epfile = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(epfile)) {
		ret = PTR_ERR(epfile);
		/* Unwind in reverse order: descriptor first, then the eventpoll. */
		put_unused_fd(fd);
		ep_free(ep);
		return ret;
	}

	/* Link eventpoll and file, then make the descriptor visible. */
	ep->file = epfile;
	fd_install(fd, epfile);
	return fd;
}

/* epoll_create1(2): forwards the caller's flags unchanged to do_epoll_create(). */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

/*
 * epoll_create(2): the size argument is only validated (it must be
 * positive); it is not passed on, and the instance is created with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	return size > 0 ? do_epoll_create(0) : -EINVAL;
}

epollFD文件描述符的回调实现

/*
 * File callbacks that implement the eventpoll file behaviour.
 * This table is what do_epoll_create() passes to anon_inode_getfile().
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

epoll_ctl 系统调用

/* User-space prototype: apply operation op for target fd on the epoll instance epfd. */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

epoll_ctl 是 epoll 的事件注册函数。它不同于 select()：select() 是在每次等待事件时才告诉内核要监听什么类型的事件，而 epoll 是先通过 epoll_ctl 注册要监听的事件类型，之后再调用 epoll_wait 等待。

/* Valid opcodes to issue to sys_epoll_ctl() */
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3

第一个参数是epoll_create()的返回值，
第二个参数表示动作，用三个宏来表示：

EPOLL_CTL_ADD：注册新的fd到epfd中；
EPOLL_CTL_MOD：修改已经注册的fd的监听事件；
EPOLL_CTL_DEL：从epfd中删除一个fd；

第三个参数是需要监听的fd ，
第四个参数是告诉内核需要监听什么事件，struct epoll_event结构如下：

/* Event descriptor exchanged with user space by epoll_ctl() and epoll_wait(). */
struct epoll_event {
	/* Bitmask of EPOLL* flags: events of interest on input, ready events on output */
	__poll_t events;  
	/* Opaque user value; ep_send_events_proc() copies it back to user space as-is */
	__u64 data;
} EPOLL_PACKED;

events 有以下掩码



/* Epoll event masks */
#define EPOLLIN		(__force __poll_t)0x00000001 // data is available to read on the descriptor (also reported when the peer socket closes normally)
#define EPOLLPRI	(__force __poll_t)0x00000002 // urgent data is available to read (e.g. out-of-band data has arrived)
#define EPOLLOUT	(__force __poll_t)0x00000004 // the descriptor can be written to
#define EPOLLERR	(__force __poll_t)0x00000008 // an error occurred on the descriptor
#define EPOLLHUP	(__force __poll_t)0x00000010 // the descriptor was hung up
#define EPOLLNVAL	(__force __poll_t)0x00000020
#define EPOLLRDNORM	(__force __poll_t)0x00000040
#define EPOLLRDBAND	(__force __poll_t)0x00000080
#define EPOLLWRNORM	(__force __poll_t)0x00000100
#define EPOLLWRBAND	(__force __poll_t)0x00000200
#define EPOLLMSG	(__force __poll_t)0x00000400
#define EPOLLRDHUP	(__force __poll_t)0x00002000

/* Set exclusive wakeup mode for the target file descriptor */
#define EPOLLEXCLUSIVE	((__force __poll_t)(1U << 28))

/*
 * Request the handling of system wakeup events so as to prevent system
 * suspends from happening while those events are being processed.
 *
 * Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will not
 * be re-allowed until epoll_wait is called again after consuming the wakeup
 * event(s).
 *
 * Requires CAP_BLOCK_SUSPEND
 */
#define EPOLLWAKEUP	((__force __poll_t)(1U << 29))

/* Set the One Shot behaviour for the target file descriptor */
#define EPOLLONESHOT	((__force __poll_t)(1U << 30)) // report only once: after delivery ep_send_events_proc() reduces the item's mask to EP_PRIVATE_BITS; the fd stays registered, so it presumably has to be re-armed (EPOLL_CTL_MOD) rather than re-added

/* Set the Edge Triggered behaviour for the target file descriptor */
#define EPOLLET		((__force __poll_t)(1U << 31)) // edge-triggered mode, as opposed to the default level-triggered mode

epoll_ctl的系统调用实现

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * Returns 0 or a negative error code:
 *   -EFAULT  the event structure could not be copied from user space
 *   -EBADF   epfd or fd is not an open descriptor
 *   -EPERM   the target file does not support poll
 *   -EINVAL  epfd is not an epoll file, epfd and fd refer to the same file,
 *            or EPOLLEXCLUSIVE is used in an unsupported way
 *   -ELOOP   adding the nested epoll file failed the ep_loop_check() test
 *   -EEXIST / -ENOENT  item already present (ADD) / not found (DEL, MOD)
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	/* Copy the event description in only for operations that carry one */
	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file
	 * descriptor the user passed to us _is_ an eventpoll file. And also
	 * we do not permit adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, nested exclusive wakeups are not currently supported.
	 */
	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			full_check = 1;
			/* Drop ep->mtx so that epmutex is taken first, then re-take ep->mtx */
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					/* Only epmutex is held here; the exit path releases it via full_check */
					goto error_tgt_fput;
				}
			} else
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				/* Target is itself an epoll file: also lock it, at nesting level 1 */
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	/* Default result: also returned for an unknown op, and for MOD on an EPOLLEXCLUSIVE item */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* EPOLLERR and EPOLLHUP are always added to the requested mask */
			epds.events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds.events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, &epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	/* Release the locks in the reverse order of acquisition */
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};
	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;
	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;
	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;
	/* Number of active wait queues attached to poll operations */
	int nwait;
	/* List containing poll wait queues */
	struct list_head pwqlist;
	/* The "container" of this item */
	struct eventpoll *ep;
	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;
	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;
	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

static LIST_HEAD(tfile_check_list);
// Global list of files with newly added links, where we may need to limit the
// number of emanating paths. Protected by the epmutex.

f = fdget(epfd);获取epfd的file指针
tf = fdget(fd);获取到需要操作的file指针
ep = f.file->private_data；获取eventpoll
epi = ep_find(ep, tf.file, fd); 在eventpoll的红黑树中查找目标文件描述符对应的epi是否已经存在

EPOLL_CTL_ADD

list_add(&tf.file->f_tfile_llink,
&tfile_check_list); 将目标file结构体添加到全局tfile_check_list表中。
在eventpoll的红黑树rbr中查找这个文件和文件描述符是否已经存在。
如果已经存在则返回错误 -EEXIST（文件已存在）；
否则将这个文件描述符添加到eventpoll error = ep_insert(ep, &epds, tf.file, fd, full_check);

EPOLL_CTL_DEL

ep_remove(ep, epi);将epi从ep中移除即可

EPOLL_CTL_MOD

error = ep_modify(ep, epi, &epds); 修改相应的epi 即可

epoll_wait系统调用

do_epoll_wait

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 *
 * Returns whatever ep_poll() returns, or -EINVAL / -EFAULT / -EBADF when the
 * arguments are rejected before polling starts.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	struct fd epf;
	int ret;

	/* The maximum number of events must be greater than zero (and bounded) */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Look up the "struct file *" behind epfd */
	epf = fdget(epfd);
	if (!epf.file)
		return -EBADF;

	/*
	 * The file underneath the descriptor must be an eventpoll file. Only
	 * then is it safe to treat "private_data" as our "struct eventpoll",
	 * which do_epoll_create() stored there through anon_inode_getfile().
	 */
	if (is_file_epoll(epf.file))
		/* Time to fish for events ... */
		ret = ep_poll(epf.file->private_data, events, maxevents,
			      timeout);
	else
		ret = -EINVAL;

	fdput(epf);
	return ret;
}

/* epoll_wait(2) entry point: passes all four arguments straight to do_epoll_wait(). */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

epfd为epoll_create1方法返回的fd，events为用户提供的 struct epoll_event 类型的数组，用于存放有监听事件发生的那些监听对象，maxevents 表示这个数组的长度，也表示epoll_wait方法最多可返回maxevents个事件就绪的监听对象。

ep_poll

/** fs/eventpoll.c
  * ep_poll - Retrieves ready events, and delivers them to the caller supplied
  *           event buffer.
  *
  * @ep: Pointer to the eventpoll context.
  * @events: Pointer to the userspace buffer where the ready events should be
  *          stored.
  * @maxevents: Size (in terms of number of events) of the caller event buffer.
  * @timeout: Maximum timeout for the ready events fetch operation, in
  *           milliseconds. If the @timeout is zero, the function will not block,
  *           while if the @timeout is less than zero, the function will block
  *           until at least one event has been retrieved (or an error
  *           occurred).
  *
  * Returns: Returns the number of ready events which have been fetched, or an
  *          error code, in case of error.
  */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout > 0) {
		/* Convert the millisecond timeout into an absolute expiry time */
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		/* check_events expects ep->wq.lock to be held */
		spin_lock_irq(&ep->wq.lock);
		goto check_events;
	}
	/* timeout < 0: "to" stays NULL, i.e. no expiry is passed to the sleep below */

fetch_events:

	if (!ep_events_available(ep))
		ep_busy_loop(ep, timed_out);

	spin_lock_irq(&ep->wq.lock);

	if (!ep_events_available(ep)) {
		/*
		 * Busy poll timed out.  Drop NAPI ID for now, we can add
		 * it back in when we have moved a socket with a valid NAPI
		 * ID onto the ready list.
		 */
		ep_reset_busy_poll_napi_id(ep);

		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/*
			 * Always short-circuit for fatal signals to allow
			 * threads to make a timely exit without the chance of
			 * finding more events available and fetching
			 * repeatedly.
			 */
			if (fatal_signal_pending(current)) {
				res = -EINTR;
				break;
			}
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Sleep without holding the lock; re-take it before re-checking */
			spin_unlock_irq(&ep->wq.lock);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irq(&ep->wq.lock);
		}

		__remove_wait_queue(&ep->wq, &wait);
		__set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irq(&ep->wq.lock);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

1. 判断是否有监听事件就绪，如果有则直接调用ep_send_events方法把就绪对象拷贝到events里，然后返回。
2. 如果没有，则先调用 init_waitqueue_entry 方法初始化wait变量，其中current参数指向当前线程的 task_struct，线程相关的数据都放在这个结构中，同时，通过这个变量也能找到相应的线程。

/* One waiter on a wait queue; ep_poll() links one of these into eventpoll->wq. */
struct wait_queue_entry {
	/* Flag bits such as WQ_FLAG_EXCLUSIVE */
	unsigned int		flags;
	/* Opaque owner data; init_waitqueue_entry() stores the task pointer here */
	void			*private;
	/* Callback stored for this entry; init_waitqueue_entry() sets default_wake_function */
	wait_queue_func_t	func;
	/* List linkage for this entry */
	struct list_head	entry;
};
/*
 * Initialize a wait queue entry for task @p: clear the flags, record the
 * task in ->private and install default_wake_function as the callback.
 * The list linkage (->entry) is not touched here.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
	wq_entry->flags		= 0;
	wq_entry->private	= p;
	wq_entry->func		= default_wake_function;
}
/*
 * Mark @wq_entry as an exclusive waiter (WQ_FLAG_EXCLUSIVE) and add it to
 * @wq_head via __add_wait_queue(). No locking is done here; ep_poll()
 * calls this while holding ep->wq.lock.
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(wq_head, wq_entry);
}

wq_entry->func的 default_wake_function 方法就是用来唤醒当前进程current对应的线程的。
3. 初始化完wait变量之后，把它放到eventpoll的wq队列中，这个上面我们也有提到过。
4. 然后进入for循环，其逻辑为，检查是否有监听事件就绪，如果没有，则调用 schedule_hrtimeout_range 方法，使当前线程进入休眠状态。
5. 当各种情况，比如signal、timeout、监听事件发生，导致该线程被唤醒，则会再进入下一次for循环，并检查监听事件是否就绪，如果就绪了，则跳出for循环，同时把wait变量从eventpoll的wq队列中移除。
6. 调用 ep_send_events 方法把就绪事件的对象拷贝到用户提供的events数组中，然后返回。

ep_send_events

/*
 * Deliver ready events to the user buffer.
 *
 * Packs the buffer pointer and its capacity into an ep_send_events_data and
 * lets ep_scan_ready_list() run ep_send_events_proc() over the ready list.
 * The result is whatever the callback left in ->res (the callback resets it
 * to 0 before it starts counting).
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data send_ctx = {
		.maxevents = maxevents,
		.events = events,
	};

	ep_scan_ready_list(ep, ep_send_events_proc, &send_ctx, 0, false);

	return send_ctx.res;
}

ep_scan_ready_list的ep_send_events_proc参数是一个回调方法，在ep_scan_ready_list中调用

ep_scan_ready_list

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	int pwake = 0;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	spin_lock_irq(&ep->wq.lock);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irq(&ep->wq.lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	spin_lock_irq(&ep->wq.lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irq(&ep->wq.lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return res;
}

ep_scan_ready_list是将eventpoll中的rdllist列表内容转移到txlist列表中，同时把rdllist列表置为空，现在txlist就持有了所有有就绪事件的对象。
然后调用上面的回调方法 ep_send_events_proc，将该列表传入其中。

ep_send_events_proc

/*
 * Scan callback used by ep_send_events(): walks the private ready list
 * @head, re-polls every item and copies the ones that still report events
 * into the user buffer described by @priv (a struct ep_send_events_data).
 *
 * The number of delivered events, or -EFAULT if the very first copy to user
 * space failed, is stored in esed->res; the function itself always returns 0.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	__poll_t revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;
	struct wakeup_source *ws;
	poll_table pt;

	/* NULL queueing callback: the re-poll below only reads the current state */
	init_poll_funcptr(&pt, NULL);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (esed->res = 0, uevent = esed->events;
	     !list_empty(head) && esed->res < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate epi->ws
		 * below).
		 *
		 * This could be rearranged to delay the deactivation of epi->ws
		 * instead, but then epi->ws would temporarily be out of sync
		 * with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		revents = ep_item_poll(epi, &pt, 1);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/* Copy failed: put the item back at the head of the list so it is not lost */
				list_add(&epi->rdllink, head);
				ep_pm_stay_awake(epi);
				/* Report -EFAULT only if nothing was delivered before the failure */
				if (!esed->res)
					esed->res = -EFAULT;
				return 0;
			}
			esed->res++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}
		}
	}

	return 0;
}

遍历head就绪列表中的所有对象，对其调用 ep_item_poll 方法，真正的去检查我们关心的那些事件是否存在。
如果有我们感兴趣的事件，则将该事件拷贝到用户event中。
如果该监听对象是 level-triggered 模式，则会把该对象再加入到就绪列表中，这样下次再调用 epoll_wait 方法，还会检查这些对象。
这也是 level-triggered 和 edge-triggered 在代码上表现出来的本质区别。
所有监听对象检查完毕后，此时满足条件的对象已经被拷贝到用户提供的events里，到这里方法就可以返回了。

ep_item_poll

/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 *
 * Returns the events currently pending on the item's file, masked with the
 * events the item is interested in.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct file *target = epi->ffd.file;
	struct eventpoll *nested;
	bool already_locked;
	__poll_t pending;

	pt->_key = epi->event.events;

	if (is_file_epoll(target)) {
		/* The target is itself an epoll file: scan its own ready list. */
		nested = target->private_data;
		poll_wait(target, &nested->poll_wait, pt);
		already_locked = pt && (pt->_qproc == ep_ptable_queue_proc);
		pending = ep_scan_ready_list(nested, ep_read_events_proc,
					     &depth, depth, already_locked);
	} else {
		/* Any other file: ask its ->poll() implementation. */
		pending = vfs_poll(target, pt);
	}

	return pending & epi->event.events;
}

如果被监听的文件本身也是一个epoll文件，则先调用poll_wait，再通过ep_scan_ready_list（回调为ep_read_events_proc）扫描这个被嵌套epoll实例自己的就绪列表。

vfs_poll

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
	if (unlikely(!file->f_op->poll))
		return DEFAULT_POLLMASK;
	return file->f_op->poll(file, pt);
}

如果是我们自定义或者是socket则执行vfs_poll
对于tcp socket对象，这个方法最终会调用 tcp_poll 方法；由于该方法涉及的都是tcp相关的内容，这里就不再展开了。

山羊哥-老宋

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
打赏
0
评论
非阻塞io之 epoll内核实现

epoll创建epollFdepollFD文件描述符的回调实现epoll_ctl 系统调用epoll_ctl的系统调用实现epoll_wait系统调用由于需要实现一个驱动想封装出epoll的设备，在网上找关于epoll的信息，好少所以自己看了内核的代码了解以下epoll机制，做个总结以便自己后面查看。/* * 这个结构存储在`struct file`结构的 "private_data"成员中，也是eventpoll接口的主要数据结构。 * 访问它受到 wq 内部锁的保护。 */struct ev
复制链接

扫一扫