从epoll源码分析它的使用

最新推荐文章于 2024-01-02 14:43:40 发布

SoulNov23

最新推荐文章于 2024-01-02 14:43:40 发布

阅读量1.5k

点赞数

分类专栏： linux内核网络编程 epoll/libev/libevent 文章标签： c语言 epoll 源码原理多线程

本文链接：https://blog.csdn.net/peng314899581/article/details/69389965

版权

网络编程同时被 3 个专栏收录

21 篇文章 0 订阅

订阅专栏

epoll/libev/libevent

7 篇文章 1 订阅

订阅专栏

linux内核

2 篇文章 0 订阅

订阅专栏

首先来看看epoll_create的真身

/*
 * epoll_create(2) entry point.
 *
 * A non-positive size is rejected with -EINVAL. Apart from that check the
 * size value is not used: the call is forwarded to sys_epoll_create1() with
 * no flags set.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	/* size is only validated above; it has no further effect */
	return sys_epoll_create1(0);
}

再来看看epoll_create1的真身

/*
 * epoll_create1(2) entry point: allocate an eventpoll instance and return a
 * new file descriptor that refers to it.
 *
 * @flags: only EPOLL_CLOEXEC is accepted; any other bit yields -EINVAL.
 *
 * Returns the new fd on success. On failure returns the negative error
 * reported by ep_alloc(), get_unused_fd_flags() or anon_inode_getfile().
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* flags is masked with O_CLOEXEC below, so the two constants must match */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/* Allocate the eventpoll structure backing this epoll instance. */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;

	/* Reserve a descriptor number; it is not visible to user space yet. */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}

	/* Create the anonymous "[eventpoll]" file, handing ep to it. */
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;

	/* Bind the reserved descriptor to the file and return it. */
	fd_install(fd, file);
	return fd;

	/* Error unwinding: release resources in reverse order of acquisition. */
out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

1. 对epoll来讲，目前唯一有效的flag只有EPOLL_CLOEXEC
2. ep_alloc初始化spinlock_t锁，mutex锁
3. 每次epoll_create1一个epollfd，内核就会分配一个eventpoll 与之对应
struct eventpoll
{
spinlock_t lock;

//添加，修改，删除fd，epoll_wait返回，内核态向用户态传递数据时都会持有这个锁，所以多线程操作epoll是安全的，内核做了保护
struct mutex mtx;

/* Wait queue used by sys_epoll_wait()*/
wait_queue_head_t wq;

/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;

//所有触发的epitem都放在这个链表里面
struct list_head rdllist;

//红黑树的root节点，所有要监听的epitem都在这个红黑树中，我们可以把红黑树的所有节点都看作epitem
struct rb_root rbr;

This is a single linked list that chains all the “struct epitem” that
happened while transferring ready events to userspace w/out
holding ->lock.
*/
struct epitem *ovflist;

/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;

/* The user that created the eventpoll descriptor */
struct user_struct *user;

struct file *file;

/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
3. epollfd背后并不存在一个磁盘上的真实文件，所以内核会通过anon_inode_getfile创建一个匿名的file结构，并通过get_unused_fd_flags分配一个真正的fd，最后由fd_install把二者关联起来，这个fd就是返回给用户的epollfd
/*
 * Excerpt of struct file showing only two epoll-related members; other
 * members referenced elsewhere (e.g. f_op, f_tfile_llink) are omitted.
 */
struct file {
	/* For an epoll file, the struct eventpoll pointer is stored here */
	void *private_data;
	struct list_head f_ep_links;
};
这样，通过epollfd找到它在内核中的file，然后通过file找到了存储的eventpoll
4. struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	struct rb_node rbn;

	/* When this item is triggered it is linked onto the rdllist of its eventpoll */
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The fd and the struct file this epitem refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queue attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The eventpoll this epitem belongs to */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source *ws;

	/*
	 * The structure that describe the interested events and the source fd,
	 * i.e. the events this epitem cares about.
	 */
	struct epoll_event event;
};
/*
 * Identifies the target of an epitem: the struct file together with the
 * descriptor number (embedded in struct epitem as "ffd").
 */
struct epoll_filefd{
/* the target's struct file */
struct file *file;
/* the descriptor number */
int fd;
};
再来看看epoll_ctl的真身

/*
 * epoll_ctl(2) entry point: add, modify or remove the target fd in the epoll
 * instance referred to by epfd.
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify() for the requested
 * op, or one of these negative errors:
 *   -EFAULT  copying the event structure from user space failed
 *   -EBADF   epfd or fd does not resolve to an open file
 *   -EPERM   the target file has no poll operation
 *   -EINVAL  epfd is not an epoll file, epfd and fd refer to the same file,
 *            or op is not one of ADD/DEL/MOD
 *   -ELOOP   ep_loop_check() rejected adding an epoll file to this instance
 *   -EEXIST  EPOLL_CTL_ADD for a target that is already registered
 *   -ENOENT  EPOLL_CTL_DEL/EPOLL_CTL_MOD for a target that is not registered
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	/* Copy the event from user space only for ops that carry one. */
    error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	/*
	 * Resolve epfd to its struct file; the eventpoll is reached through
	 * this file further down.
	 */
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	/*
	 * NOTE(review): epds is only filled in when ep_op_has_event(op) was
	 * true above, so for other ops this looks like a read of an
	 * uninitialized epds.events -- confirm.
	 */
	if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
		epds.events &= ~EPOLLWAKEUP;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	/* an epoll instance cannot monitor itself */
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	/* Reach the eventpoll through the file resolved from epfd. */
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We need to hold the epmutex across both ep_insert and ep_remove
	 * b/c we want to make sure we are looking at a coherent view of
	 * epoll network.
	 */
	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
	}
	if (op == EPOLL_CTL_ADD) {
		if (is_file_epoll(tfile)) {
			error = -ELOOP;
			if (ep_loop_check(ep, tfile) != 0) {
				clear_tfile_check_list();
				goto error_tgt_fput;
			}
		} else
			list_add(&tfile->f_tfile_llink, &tfile_check_list);
	}

	/* Per-instance mutex: held across the lookup and the ADD/DEL/MOD below. */
	mutex_lock_nested(&ep->mtx, 0);

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	/*
	 * At the API level an fd can only be added once; in the red-black
	 * tree that corresponds to one epitem per target.
	 */
	epi = ep_find(ep, tfile, fd);

	/* -EINVAL is what is returned when op matches none of the cases. */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* POLLERR and POLLHUP are always added to the requested events */
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/* POLLERR and POLLHUP are always added to the requested events */
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

	/* Cleanup ladder: each label releases what was acquired before the jump. */
error_tgt_fput:
	if (did_lock_epmutex)
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}

这里我们可以很清楚地看到，EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD操作都是有加锁保护的（持有ep->mtx），ep_insert内部还使用了spinlock_t锁。ep_insert首先查看eventpoll中的user成员，检查是否超过了允许的最大监听数量，然后分配一个epitem，并设置回调ep_ptable_queue_proc。接着ep_insert会调用目标fd的poll方法，在poll的过程中ep_ptable_queue_proc被调用：它把epitem对应的等待项挂到fd所持有的waitqueue上，并把唤醒回调设置为ep_poll_callback。之后当fd上有事件触发时，ep_poll_callback被调用，它把触发的epitem放到之前说的eventpoll的rdllist中，并唤醒阻塞在epoll_wait上的线程。最后我们的epoll_wait就是遍历这个rdllist，如果有事件触发，就开始从内核态拷贝数据给用户态，这里也使用了spinlock_t锁。ET和LT的区别是在拷贝完之后处理的：如果是ET，epitem不会再进入rdllist，除非fd再次发生了状态改变、ep_poll_callback再次被调用；如果是LT，epitem会被重新插入到rdllist，下一次epoll_wait时会重新检查它，只要事件仍然就绪（例如还有未读完的数据），就会再次返回给你。
总结：

我们不是一定非要在主线程中listen之后完成accept、recv，然后把数据丢给工作线程池。因为在多线程中EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD都是安全的，我们完全可以让线程池来代替主线程做accept、recv，当然这个线程池应该是CPU密集的，数量最好是CPU核数。这样主线程只做一件事情——监听就行了，连接管理就交给这个线程池来做，最后数据处理还是给工作线程池。
对比select，每次调用select时都要把fd集合从用户态拷贝到内核态，每次都要重复拷贝，而epoll只是在EPOLL_CTL_ADD调用了一次，也就是只拷贝了一次
对比select，每次select返回之前都需要在内核中遍历传进来的整个fd集合；而epoll内部通过红黑树管理fd，查找速度更快，并且触发的事件都会通过回调函数放到rdllist，epoll_wait返回时仅仅只是从rdllist拿已经触发的事件。select和epoll都会有睡眠和唤醒的状态切换，但是select在唤醒的时候需要去遍历，而epoll只需要判断链表是否为空，也节约了CPU消耗
对比select，select支持的文件描述符数量默认是1024，就算修改配置，后面遍历的速度也会越来越慢，没有红黑树快。而epoll支持的文件描述符数量是一个进程能够打开的最大文件描述符数目，1G内存大概可以提供10万个
联系著名的“惊群”现象，多线程中epoll_wait会不会因为同一个fd的事件触发而触发了多个线程去处理？由于epoll_wait从rdllist拿事件是加锁了的，所以不会。

SoulNov23

关注

0
点赞
踩
13

收藏

觉得还不错? 一键收藏
1
评论
从epoll源码分析它的使用

首先来看看epoll_create的真身SYSCALL_DEFINE1(epoll_create, int, size){ if (size <= 0) return -EINVAL; //也就是说参数size根本用不上 return sys_epoll_create1(0);}再来看看epoll_create1的真身SYSCALL_DEFINE1(epoll_c
复制链接

扫一扫