IO模式 epoll详解

最新推荐文章于 2023-06-03 21:16:30 发布

置顶云车干

最新推荐文章于 2023-06-03 21:16:30 发布

阅读量469

点赞数 1

文章标签： epoll linux

原文链接：https://mp.weixin.qq.com/s/OmRdUgO1guMX76EdZn11UQ

版权

Github-Linux源码
 参考链接一
 参考链接二
 参考链接三
 参考链接四

文章目录

文件描述符fd
IO模式
select、epoll对比
epoll详解

复用技术：在单条信道上处理多个事件；对应到IO多路复用，就是用单个线程处理多个IO事件。
这里所说的IO主要指的是网络IO，在Linux中一切皆文件，因此网络IO常用文件描述符fd表示。

文件描述符fd

用于指向某个文件引用的抽象概念。实际上fd是一个非负整数，用于一个进程维护的文件数组的索引。当程序创建或打开一个文件时，内核会向进程返回一个文件描述符。

IO模式

在Linux的缓存IO中，IO的数据会先拷贝到内核的缓存区域，然后才会从内核缓冲区拷贝到进程的地址空间。

阻塞IO
当用户进程调用recvfrom函数时，就会进入阻塞状态；内核等待IO数据到达并将其拷贝到内核缓冲区，数据准备完成后再拷贝到用户空间，然后进程才会解除阻塞（Block）。
非阻塞IO
主动轮询：当用户进程发出read操作，如果此时内核中的数据没准备好，就会立即返回一个error；对用户进程来说，每次调用都会马上收到回复，只需要判断是不是error，是的话就再次发送read操作。
IO多路复用
本质还是阻塞IO，用户进程调用select就会进入阻塞，内核监听事件。区别在于可以用一个进程同时处理多个socket。
异步IO
用户进程发起read后，就可以去做其他事情；内核收到异步read（asynchronous read）请求后会准备数据，准备好后给用户进程发送一个signal。

select、epoll对比

用户态将文件拷贝入内核态的方式
select：创建三个描述符集合（fd_set）并拷贝入内核，分别监听读、写、异常动作。可监听的fd受fd_set大小（FD_SETSIZE）限制，默认是1024；
epoll：执行epoll_create函数会在内核的高速cache区建立红黑树和就绪链表(存储已经就绪的文件描述符fd)。接着用户执行的epoll_ctl函数添加文件描述符会在红黑树中增加节点；
内核检测文件描述符读写状态的方式
select：select采用轮询的方式，遍历所有fd；
epoll：采用回调机制。在执行epoll_ctl的add操作时，不仅会在红黑树增加节点，而且注册了回调函数，内核在检测到某个fd可读/可写时，会调用回调函数，该回调函数会将fd放入就绪链表；
找到就绪的fd返回给用户态的方式
select：将之前传入到内核的fd_set传出到用户态，并返回就绪的fd数量。此时用户态不知道哪些fd就绪，需要遍历来判断，发现一个就将返回的就绪fd数量减一；
epoll：epoll_wait函数观察就绪链表有没有fd，将链表的数据拷贝到用户传入的数组并返回就绪数量。用户态只需要遍历数组挨个处理就行；需要注意，Linux的epoll实现并没有通过mmap让内核和用户态共享内存，就绪事件仍然是从内核拷贝到用户态的，只是每次只拷贝已就绪的fd，而不是全部被监听的fd，因此减少了不必要的拷贝。

epoll详解

epoll_create会创建一个eventpoll对象，包含红黑树(所有的socket连接)、就绪链表、阻塞的进程；

// Taken from the Linux source on GitHub (abridged: "..." marks omitted members)
// struct eventpoll: the object that backs one epoll instance
struct eventpoll {
	...
	
	/* Wait queue used by sys_epoll_wait() */
	// Per the author's note: when a softirq finds data ready, wq is used to
	// find the user processes blocked on this epoll object
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	// The ready list
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	// Red-black tree that holds every socket connection added to this epoll
	struct rb_root_cached rbr;

	// The file opened by the current process that is associated with this object
	struct file *file;
	
	...
};

// Creates the eventpoll and its file and associates the two.
// Abridged listing: everything after the local declarations is elided ("···").
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;
	···
}

进程在打开一个文件时，会创建一个file结构，表示打开文件的上下文，然后进程会通过int类型的fd来访问这个file，实际上进程维护了一个file数组，fd就是这个数组的下标。调用epoll_ctl时会传入该fd，找到epoll对应的文件，找到eventpoll对象。
在这里插入图片描述

epoll_ctl调用步骤

初始化一个红黑树节点epitem对象
初始化socket等待队列，并注册回调函数（见下面的ep_insert）。

/*
 * struct epitem - one monitored file descriptor inside an eventpoll.
 *
 * Abridged listing: ep_insert() also accesses rdllink, next and event
 * members that are not reproduced here.
 */
struct epitem {

    /* Red-black tree node linking this item into the owning eventpoll's tree */
    struct rb_node rbn;

    /* File descriptor information for the monitored socket */
    struct epoll_filefd ffd;

    /* The eventpoll object this item belongs to */
    struct eventpoll *ep;

    /* Wait queue list */
    struct list_head pwqlist;
};

/*
 * ep_insert - add a new monitored item for (tfile, fd) to the eventpoll @ep.
 *
 * Steps visible in this listing: check ep->user->epoll_watches against
 * max_user_watches, allocate and initialise an epitem, attach it to the
 * target file, insert it into ep's red-black tree, poll the target once
 * through a poll table whose queue callback is ep_ptable_queue_proc, and,
 * if the file already reports events, link the item onto ep->rdllist and
 * wake up waiters.
 *
 * Returns 0 on success, -ENOSPC when the watch limit has been reached, or
 * -ENOMEM when the allocation, attach_epitem() or the poll-queue setup fails.
 *
 * NOTE(review): `error` is declared but never used in the code shown here;
 * presumably this listing is abridged from the kernel source - confirm.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;
	struct eventpoll *tep = NULL;

	/* When the target file is itself an epoll file, remember its eventpoll */
	if (is_file_epoll(tfile))
		tep = tfile->private_data;

	lockdep_assert_irqs_enabled();

	/* Refuse the new watch once the user's watch count hits the limit */
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	// Allocate the epitem (the watch item) from epi_cache
	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->next = EP_UNACTIVE_PTR;

	/* For a nested epoll target, hold its mtx across the attach and tree insert */
	if (tep)
		mutex_lock_nested(&tep->mtx, 1);
	/* Add the current item to the list of active epoll hook for this file */
	if (unlikely(attach_epitem(tfile, epi) < 0)) {
		kmem_cache_free(epi_cache, epi);
		if (tep)
			mutex_unlock(&tep->mtx);
		return -ENOMEM;
	}

	if (full_check && !tep)
		list_file(tfile);

	atomic_long_inc(&ep->user->epoll_watches);

	// Insert the item into the eventpoll's red-black tree
	ep_rbtree_insert(ep, epi);
	if (tep)
		mutex_unlock(&tep->mtx);

	/* Set up the poll table with ep_ptable_queue_proc as its queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	// Poll the target once through the poll table prepared above and keep the
	// returned event mask in revents.
	// NOTE(review): the author's note says ep_ptable_queue_proc is called
	// whenever the target has an event; presumably it is instead the hook that
	// installs the wait-queue entry during this poll - confirm against the
	// kernel source.
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (unlikely(!epq.epi)) {
		ep_remove(ep, epi);
		return -ENOMEM;
	}

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL);

	return 0;
}

假设创建了两个socket和一个epoll对象，它们对应的文件描述符（fd）分别为5000、5001、5002
在这里插入图片描述

epoll_wait被调用时，如果观察到就绪链表有数据就直接返回；没有数据就创建一个等待队列项，将其加入到eventpoll的等待队列（wq），然后阻塞自己。

/*
 * do_epoll_wait - validate the arguments of an epoll wait and hand over to
 * ep_poll().
 *
 * @epfd:      file descriptor that should refer to an epoll instance
 * @events:    user-space array that receives the ready events
 * @maxevents: capacity of @events; must be in 1..EP_MAX_EVENTS
 * @to:        timeout forwarded unchanged to ep_poll()
 *
 * Returns the result of ep_poll(), or -EINVAL for a bad @maxevents or an
 * @epfd that is not an epoll file, -EFAULT when @events is not writeable,
 * -EBADF when @epfd is not an open file descriptor.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the file behind epfd */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * The file underneath the fd must really be an eventpoll file;
	 * otherwise private_data below is not a struct eventpoll and must
	 * not be interpreted as one.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/* The eventpoll object is stored in the file's private_data */
	ep = f.file->private_data;

	/* Wait for / collect events */
	error = ep_poll(ep, events, maxevents, to);

error_fput:
	/* Drop the reference taken by fdget() on every path past this point */
	fdput(f);
	return error;
}

在这里插入图片描述

云车干

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
IO模式 epoll详解

参考链接一参考链接二复用技术：在单条信道上处理多个时间，用单个线程处理多个事件。这里所说的IO主要指的是网络IO，在Linux中一切皆文件，因此网络IO常用文件描述符fd表示。select、epoll对比用户态将文件拷贝入内核态的方式select：创建三个描述符集合并拷贝入内核，分别监听读、写、异常动作。受到单个进程（select线程）可以打开fd数量限制，默认是1024；epoll：执行epoll_create函数会在内核的高速cache区建立红黑树和就绪链表(存储已经就绪的文件描述符
复制链接

扫一扫