Concepts
A blocking operation means that when a process issues a read/write (or similar) call from user space and the resource is not available, the process is suspended; it is woken up to complete the operation only once the condition it is waiting for is satisfied.
A non-blocking operation does not suspend the process when the resource is unavailable: the call either gives up immediately or keeps querying until the operation can proceed.
The former corresponds to opening the file without the O_NONBLOCK flag; the latter corresponds to opening it with O_NONBLOCK.
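A minimal user-space sketch of the difference (the device path /dev/xxx is a placeholder): with O_NONBLOCK set, read() returns -1 with errno set to EAGAIN instead of sleeping.
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[64];

    /* Open with O_NONBLOCK: read() will not sleep when no data is ready */
    int fd = open("/dev/xxx", O_RDONLY | O_NONBLOCK);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    ssize_t n = read(fd, buf, sizeof(buf));
    if (n < 0 && errno == EAGAIN)
        printf("no data ready, not blocking\n"); /* non-blocking path */
    else if (n >= 0)
        printf("read %zd bytes\n", n);

    close(fd);
    return 0;
}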
Blocking I/O
DECLARE_WAIT_QUEUE_HEAD(xxx_wait);

static ssize_t xxx_write(struct file *file, const char *buffer, size_t count,
                         loff_t *ppos)
{
    ...
    DECLARE_WAITQUEUE(wait, current);   /* define a wait queue element */
    add_wait_queue(&xxx_wait, &wait);   /* add the element to the wait queue */

    /* Wait until the device buffer is writable */
    do {
        avail = device_writable(...);
        if (avail < 0) {
            if (file->f_flags & O_NONBLOCK) { /* non-blocking open */
                ret = -EAGAIN;
                goto out;
            }
            __set_current_state(TASK_INTERRUPTIBLE); /* change the task state */
            schedule();                              /* yield the CPU to other tasks */
            if (signal_pending(current)) {           /* woken up by a signal */
                ret = -ERESTARTSYS;
                goto out;
            }
        }
    } while (avail < 0);

    /* Write to the device buffer */
    device_write(...);
out:
    remove_wait_queue(&xxx_wait, &wait); /* remove the element from the xxx_wait queue */
    set_current_state(TASK_RUNNING);     /* set the task state back to TASK_RUNNING */
    return ret;
}
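The same blocking logic is more commonly written with the wait_event_interruptible() helper, which performs the state change, schedule() and signal check internally. A sketch under the same assumptions (device_writable() and device_write() are the placeholders from the template above):
static ssize_t xxx_write(struct file *file, const char *buffer, size_t count,
                         loff_t *ppos)
{
    if (!device_writable(...)) {
        if (file->f_flags & O_NONBLOCK)
            return -EAGAIN;
        /* Sleep until the condition holds; returns -ERESTARTSYS on a signal */
        if (wait_event_interruptible(xxx_wait, device_writable(...)))
            return -ERESTARTSYS;
    }
    return device_write(...);
}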
Non-blocking I/O
select/poll
Applications that use non-blocking I/O usually rely on the select/poll system calls, or the epoll interfaces, to query whether a device can be accessed without blocking; for this, the device driver provides a poll file operation.
The driver's poll method itself never blocks, but select/poll, and likewise the epoll interfaces, do block until at least one file descriptor in the watched set becomes accessible or the timeout expires.
static unsigned int xxx_poll(struct file *filp, poll_table *wait)
{
    unsigned int mask = 0;
    struct xxx_dev *dev = filp->private_data; /* get the device structure */
    ...
    poll_wait(filp, &dev->r_wait, wait); /* register on the read wait queue */
    poll_wait(filp, &dev->w_wait, wait); /* register on the write wait queue */

    if (...) /* readable */
        mask |= POLLIN | POLLRDNORM;  /* data is available to read */
    if (...) /* writable */
        mask |= POLLOUT | POLLWRNORM; /* data can be written */
    ...
    return mask;
}
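Note that poll_wait() only registers the caller on the wait queues; it does not sleep. For the mechanism to work, the driver must wake those queues whenever the device state changes, typically in the interrupt handler or after consuming buffered data. A minimal sketch, reusing the xxx_dev and r_wait names assumed above:
/* Somewhere in the driver, when new data arrives (e.g. in the ISR): */
static irqreturn_t xxx_interrupt(int irq, void *dev_id)
{
    struct xxx_dev *dev = dev_id;
    ...
    /* Data is now readable: wake anyone sleeping in select/poll/epoll */
    wake_up_interruptible(&dev->r_wait);
    return IRQ_HANDLED;
}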
The system call path is as follows:
select/poll
sys_poll
do_sys_poll
do_poll
do_pollfd
vfs_poll(file->f_op->poll)
1. If the timeout is 0, do_poll does not sleep and returns immediately; if it is greater than 0, the task sleeps until the timeout expires and then returns; if end_time is NULL, it blocks indefinitely.
2. do_pollfd goes through the VFS directly to file->f_op->poll; the returned mask is checked for read/write bits to mark whether the fd can be operated on.
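A minimal user-space sketch of the three timeout cases (here the watched fd is assumed to be stdin, but any pollable descriptor works):
#include <poll.h>
#include <stdio.h>

int main(void)
{
    struct pollfd pfd = { .fd = 0, .events = POLLIN }; /* stdin */

    poll(&pfd, 1, 0);    /* timeout == 0: check readiness, return at once */
    poll(&pfd, 1, 1000); /* timeout > 0: sleep up to 1000 ms */
    poll(&pfd, 1, -1);   /* negative timeout: block until stdin is readable */

    if (pfd.revents & POLLIN)
        printf("stdin is readable\n");
    return 0;
}
Returning to the kernel side, do_poll is implemented as follows: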
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table->_qproc to them on the next loop iteration.
*/
pt->_qproc = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
User-space example
#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>
#include <netinet/in.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdlib.h>

int main()
{
    int server_sockfd, client_sockfd;
    int server_len;
    socklen_t client_len; /* accept() expects a socklen_t * */
    struct sockaddr_in server_address;
    struct sockaddr_in client_address;
    int result;
    fd_set readfds, testfds;

    server_sockfd = socket(AF_INET, SOCK_STREAM, 0); // create the server socket
    server_address.sin_family = AF_INET;
    server_address.sin_addr.s_addr = htonl(INADDR_ANY);
    server_address.sin_port = htons(8888);
    server_len = sizeof(server_address);
    bind(server_sockfd, (struct sockaddr *)&server_address, server_len);
    listen(server_sockfd, 5); // listen backlog of at most 5 connections

    FD_ZERO(&readfds);
    FD_SET(server_sockfd, &readfds); // add the server socket to the set

    while(1)
    {
        char ch;
        int fd;
        int nread;

        // Copy the watched set into the set handed to select: select modifies
        // its argument, so the two variables must be kept separate
        testfds = readfds;
        printf("server waiting %d\n", FD_SETSIZE);

        /* Block indefinitely, waiting for activity on the descriptors */
        result = select(FD_SETSIZE, &testfds, (fd_set *)0,
                        (fd_set *)0, (struct timeval *)0); // FD_SETSIZE: the system's default maximum number of fds
        if(result < 1)
        {
            perror("server5");
            exit(1);
        }
        printf("select server ready %d\n", result);

        /* Scan all file descriptors */
        for(fd = 0; fd < FD_SETSIZE; fd++)
        {
            /* Find the descriptors with activity */
            if(FD_ISSET(fd, &testfds))
            {
                /* The server socket itself means a new client connection request */
                if(fd == server_sockfd)
                {
                    client_len = sizeof(client_address);
                    client_sockfd = accept(server_sockfd,
                            (struct sockaddr *)&client_address, &client_len);
                    FD_SET(client_sockfd, &readfds); // add the client socket to the set
                    printf("server fd %d adding client on fd %d\n", server_sockfd, client_sockfd);
                }
                /* Otherwise a client socket has data pending */
                else
                {
                    ioctl(fd, FIONREAD, &nread); // store the number of pending bytes in nread
                    /* The client is finished: close the socket and remove its fd from the set */
                    if(nread == 0)
                    {
                        close(fd);
                        FD_CLR(fd, &readfds); // drop the closed fd
                        printf("removing client on fd %d\n", fd);
                    }
                    /* Serve the client's data request */
                    else
                    {
                        read(fd, &ch, 1);
                        sleep(5);
                        printf("serving client on fd %d\n", fd);
                        ch++;
                        write(fd, &ch, 1);
                    }
                }
            }
        }
    }
    return 0;
}
epoll
After epoll_create has created the epoll handle:
epoll_ctl associates a new fd with the epoll handle: the fd is copied into the kernel once, inserted into a red-black tree, and a callback function is installed.
1. ep_item_poll is then called once, reading file->f_op->poll to check whether any read/write bits are already set.
2. Afterwards, whenever the file descriptor changes state (e.g. something is written to the fd), the callback fires and file->f_op->poll is consulted again to update the event bits.
sys_epoll_ctl
ep_insert
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
lockdep_assert_irqs_enabled();
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
ep_rbtree_insert(ep, epi);
/* now check if we've created too many backpaths */
error = -EINVAL;
if (full_check && reverse_path_check())
goto error_remove_epi;
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt, 1);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
error = -ENOMEM;
if (epi->nwait < 0)
goto error_unregister;
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 0;
error_unregister:
ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase_cached(&epi->rbn, &ep->rbr);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
}
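The callback installed above via init_poll_funcptr(&epq.pt, ep_ptable_queue_proc) is what links epoll into the driver's wait queues. A simplified sketch of it (close to, but not literally, the kernel source; the EPOLLEXCLUSIVE handling is omitted): when the driver's poll method calls poll_wait(), this function runs and parks ep_poll_callback on the device's wait queue.
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt); /* owning epitem */
    struct eppoll_entry *pwq;

    pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    if (!pwq) {
        epi->nwait = -1; /* ep_insert() checks this and bails out */
        return;
    }
    /* Arrange for ep_poll_callback() to run on every wake_up(whead) */
    init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    pwq->whead = whead;
    pwq->base = epi;
    add_wait_queue(whead, &pwq->wait);
    list_add_tail(&pwq->llink, &epi->pwqlist);
    epi->nwait++;
}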
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
struct eventpoll *ep;
bool locked;
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
return vfs_poll(epi->ffd.file, pt) & epi->event.events;
ep = epi->ffd.file->private_data;
poll_wait(epi->ffd.file, &ep->poll_wait, pt);
locked = pt && (pt->_qproc == ep_ptable_queue_proc);
return ep_scan_ready_list(epi->ffd.file->private_data,
ep_read_events_proc, &depth, depth,
locked) & epi->event.events;
}
epoll_wait checks the ready list to see whether any fds are already ready:
epoll_wait
do_epoll_wait
ep_poll
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation. We still need
* lock because we could race and not see an epi being added
* to the ready list while in irq callback. Thus incorrectly
* returning 0 back to userspace.
*/
timed_out = 1;
write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
write_unlock_irq(&ep->lock);
goto send_events;
}
fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
eavail = ep_events_available(ep);
if (eavail)
goto send_events;
/*
* Busy poll timed out. Drop NAPI ID for now, we can add
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
ep_reset_busy_poll_napi_id(ep);
do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* wakeup. Why it is important? In case of several waiters
* each new wakeup will hit the next waiter, giving it the
* chance to harvest new event. Otherwise wakeup can be
* lost. This is also good performance-wise, because on
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
*/
init_wait(&wait);
write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
/*
* Always short-circuit for fatal signals to allow
* threads to make a timely exit without the chance of
* finding more events available and fetching
* repeatedly.
*/
if (fatal_signal_pending(current)) {
res = -EINTR;
break;
}
eavail = ep_events_available(ep);
if (eavail)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;
break;
}
/* We were woken up, thus go and try to harvest some events */
eavail = 1;
} while (0);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
}
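The wake-up side completes the picture: when the driver calls wake_up() on its wait queue, the ep_poll_callback parked there by ep_ptable_queue_proc moves the epitem onto the ready list and wakes any task sleeping in ep_poll. A heavily simplified sketch (the real function also handles the ovflist, event-key matching and EPOLLEXCLUSIVE, and uses lockless list helpers):
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode,
                            int sync, void *key)
{
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    unsigned long flags;

    write_lock_irqsave(&ep->lock, flags);
    /* Link the ready fd into the ready list, if not already there */
    if (!ep_is_linked(epi))
        list_add_tail(&epi->rdllink, &ep->rdllist);
    /* Wake up the task blocked in ep_poll()/epoll_wait() */
    if (waitqueue_active(&ep->wq))
        wake_up(&ep->wq);
    write_unlock_irqrestore(&ep->lock, flags);
    return 1;
}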
User-space example
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/epoll.h>
#include <fcntl.h>

#define MAX_EVENTS 10
#define MAX_BUFFER_SIZE 1024

int main() {
    int server_socket, client_socket;
    struct sockaddr_in server_addr, client_addr;
    socklen_t addr_size;
    char buffer[MAX_BUFFER_SIZE];
    struct epoll_event event, events[MAX_EVENTS];

    // Create the socket
    server_socket = socket(AF_INET, SOCK_STREAM, 0);
    if (server_socket < 0) {
        perror("Error creating socket");
        exit(EXIT_FAILURE);
    }

    // Set up the server address
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(6868);
    //server_addr.sin_addr.s_addr = INADDR_ANY;
    server_addr.sin_addr.s_addr = inet_addr("127.0.0.1"); // bind to 127.0.0.1
    memset(server_addr.sin_zero, '\0', sizeof(server_addr.sin_zero));

    // Bind the socket to the address
    if (bind(server_socket, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        perror("Error binding socket");
        exit(EXIT_FAILURE);
    }

    // Listen
    if (listen(server_socket, 5) < 0) {
        perror("Error listening");
        exit(EXIT_FAILURE);
    }

    // Create the epoll instance
    int epoll_fd = epoll_create1(0);
    if (epoll_fd < 0) {
        perror("Error creating epoll instance");
        exit(EXIT_FAILURE);
    }

    // Fill in the event structure
    event.events = EPOLLIN;
    event.data.fd = server_socket;

    // Add the listening socket to the epoll instance
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, server_socket, &event) < 0) {
        perror("Error adding socket to epoll instance");
        exit(EXIT_FAILURE);
    }

    while (1) {
        int num_ready = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
        if (num_ready < 0) {
            perror("Error waiting for events");
            exit(EXIT_FAILURE);
        }
        for (int i = 0; i < num_ready; i++) {
            if (events[i].data.fd == server_socket) {
                // A new client connection request arrived
                addr_size = sizeof(client_addr);
                client_socket = accept(server_socket, (struct sockaddr*)&client_addr, &addr_size);
                if (client_socket < 0) {
                    perror("Error accepting connection");
                    continue;
                }
                // Make client_socket non-blocking (required for edge-triggered mode)
                int flags = fcntl(client_socket, F_GETFL, 0);
                fcntl(client_socket, F_SETFL, flags | O_NONBLOCK);
                event.events = EPOLLIN | EPOLLET; // edge-triggered
                event.data.fd = client_socket;
                // Add the client socket to the epoll instance
                if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client_socket, &event) < 0) {
                    perror("Error adding client socket to epoll instance");
                    exit(EXIT_FAILURE);
                }
                printf("New client connected: %s\n", inet_ntoa(client_addr.sin_addr));
            } else {
                // Handle data sent by a client. Note: with EPOLLET a robust
                // server should loop on recv() until it returns EAGAIN; a
                // single recv() is kept here for brevity.
                int client_fd = events[i].data.fd;
                memset(buffer, 0, MAX_BUFFER_SIZE);
                int num_bytes = recv(client_fd, buffer, MAX_BUFFER_SIZE - 1, 0); // leave room for '\0'
                if (num_bytes < 0) {
                    perror("Error receiving data");
                    close(client_fd);
                    continue;
                } else if (num_bytes == 0) {
                    // The client closed the connection
                    printf("Client disconnected\n");
                    close(client_fd);
                    continue;
                }
                // Process the received data
                printf("Received data from client: %s\n", buffer);
                // Echo the data back to the client
                send(client_fd, buffer, num_bytes, 0);
            }
        }
    }

    // Close the sockets and the epoll instance
    close(server_socket);
    close(epoll_fd);
    return 0;
}
Summary
(1) select/poll must repeatedly scan the whole fd set themselves until a device becomes ready, possibly sleeping and waking several times in between. epoll's epoll_wait also polls a list, likewise with possible sleep/wake cycles, but it polls only the ready list: when a device becomes ready, the callback installed via ep_ptable_queue_proc puts the ready fd onto the ready list and wakes the process sleeping in epoll_wait. So although both sleep and wake repeatedly, select and poll must traverse the entire fd set every time they are awake, whereas epoll only has to check whether the ready list is empty. This saves a large amount of CPU time; that is the performance gain brought by the callback mechanism.
(2) Every select/poll call copies the fd set from user space into the kernel and hooks current onto each device's wait queue; epoll copies each fd only once (at epoll_ctl time) and hooks current onto a wait queue only once per epoll_wait call, at its start (note that this is epoll's own internal wait queue, not a device wait queue). This also saves considerable overhead, as the sketch below illustrates.
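A minimal sketch of point (2), event-loop skeletons only with error handling omitted: select re-submits the whole fd set to the kernel on every iteration, while epoll registers each fd once and afterwards only harvests ready events.
/* select: the watched set crosses the user/kernel boundary on EVERY call */
while (1) {
    fd_set rfds = watched;               /* rebuild the set each time */
    select(maxfd + 1, &rfds, NULL, NULL, NULL);
    /* ...scan all fds in rfds with FD_ISSET()... */
}

/* epoll: each fd is copied into the kernel ONCE, at registration time */
struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); /* one-time registration */
while (1) {
    int n = epoll_wait(epfd, events, MAX_EVENTS, -1);
    /* ...only the n ready events come back; no full scan needed... */
}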