Blocking and Non-Blocking I/O in Device Drivers

Concepts

A blocking operation means that when a process issues read/write etc. from user space and the resource is not available, the process is suspended until the condition it is waiting for is satisfied, at which point it is woken up and the operation completes.

A non-blocking operation does not suspend the process when the resource is unavailable; the process either gives up or keeps querying until the operation can be performed.

The former corresponds to opening the file without the O_NONBLOCK flag; the latter opens the file with O_NONBLOCK.
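As a minimal user-space sketch of the difference (the device node /dev/xxx and the buffer size are placeholders), the same read() either sleeps or fails immediately with EAGAIN depending on whether the file was opened with O_NONBLOCK:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

int main(void)
{
	char buf[64];

	/* Blocking open: read() sleeps until data is available */
	int fd = open("/dev/xxx", O_RDWR);
	read(fd, buf, sizeof(buf));

	/* Non-blocking open: read() returns -1 with errno == EAGAIN
	 * when there is nothing to read yet */
	int nfd = open("/dev/xxx", O_RDWR | O_NONBLOCK);
	if (read(nfd, buf, sizeof(buf)) < 0 && errno == EAGAIN)
		printf("device not ready, try again later\n");

	close(fd);
	close(nfd);
	return 0;
}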

Blocking I/O

The template below shows how a device driver uses a wait queue. During a write I/O operation the driver checks whether the device is writable; if it is not and the I/O is blocking, the process is put to sleep and hung on the wait queue.
DECLARE_WAIT_QUEUE_HEAD(xxx_wait);

static ssize_t xxx_write(struct file *file, const char *buffer, size_t count,
			 loff_t *ppos)
{
	...
	DECLARE_WAITQUEUE(wait, current);	/* define a wait queue element */
	add_wait_queue(&xxx_wait, &wait);	/* add the element to the wait queue */

	/* Wait until the device buffer is writable */
	do {
		avail = device_writable(...);
		if (avail < 0) {
			if (file->f_flags & O_NONBLOCK) {	/* non-blocking */
				ret = -EAGAIN;
				goto out;
			}
			__set_current_state(TASK_INTERRUPTIBLE);	/* change the process state */
			schedule();	/* let other processes run */
			if (signal_pending(current)) {	/* woken up by a signal */
				ret = -ERESTARTSYS;
				goto out;
			}
		}
	} while (avail < 0);

	/* Write to the device buffer */
	device_write(...);
out:
	remove_wait_queue(&xxx_wait, &wait);	/* remove the element from the queue headed by xxx_wait */
	set_current_state(TASK_RUNNING);	/* set the process state back to TASK_RUNNING */
	return ret;
}
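The writer that sleeps in the template above has to be woken by some other execution path, typically the interrupt handler or whatever code frees space in the device buffer. A minimal sketch of that side, reusing the same hypothetical xxx_wait queue (the xxx names are placeholders, as in the template):

static irqreturn_t xxx_interrupt(int irq, void *dev_id)
{
	...
	/* Buffer space has been freed: wake any writer sleeping on xxx_wait.
	 * The woken task re-checks device_writable() in its do/while loop. */
	wake_up_interruptible(&xxx_wait);
	return IRQ_HANDLED;
}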

Non-Blocking I/O

select/poll

Applications using non-blocking I/O usually call the select/poll system calls, or the epoll family of interfaces, to query whether a device can be accessed without blocking; the device driver provides a poll method for this.

The driver's poll method itself never blocks, but the select/poll system calls (and the epoll interfaces) do block, waiting until at least one file descriptor in the set becomes accessible or the timeout expires.

static unsigned int xxx_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask = 0;
	struct xxx_dev *dev = filp->private_data;	/* get the device structure */

	...
	poll_wait(filp, &dev->r_wait, wait);	/* register on the read wait queue */
	poll_wait(filp, &dev->w_wait, wait);	/* register on the write wait queue */

	if (...)	/* readable */
		mask |= POLLIN | POLLRDNORM;	/* data is available (readable by user space) */

	if (...)	/* writable */
		mask |= POLLOUT | POLLWRNORM;	/* data can be written */
	...
	return mask;
}
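From user space, the mask returned by xxx_poll shows up in the revents field filled in by poll()/select(). A minimal sketch, assuming the hypothetical device node /dev/xxx used by the templates above:

#include <stdio.h>
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
	struct pollfd pfd;

	pfd.fd = open("/dev/xxx", O_RDWR | O_NONBLOCK);
	pfd.events = POLLIN | POLLOUT;		/* interested in readable and writable */

	if (poll(&pfd, 1, 1000) > 0) {		/* wait up to 1000 ms */
		if (pfd.revents & POLLIN)
			printf("device is readable\n");
		if (pfd.revents & POLLOUT)
			printf("device is writable\n");
	}

	close(pfd.fd);
	return 0;
}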

The system-call path is as follows:

select/poll
	sys_poll
		do_sys_poll
			do_poll
				do_pollfd
					vfs_poll(file->f_op->poll)

1. If the timeout is 0, do_poll does not sleep and returns immediately; if it is greater than 0, the call sleeps until the timeout expires and then returns; if end_time is NULL, the call blocks indefinitely.

2. do_pollfd goes through vfs_poll to the driver's file->f_op->poll; from the returned mask it checks the read/write bits to decide whether the fd can be operated on.
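From user space these three cases correspond to the timeout argument of poll() (or the struct timeval passed to select()); a rough sketch, assuming fd is an already-opened descriptor:

struct pollfd pfd = { .fd = fd, .events = POLLIN };

poll(&pfd, 1, 0);    /* timeout 0: end_time is zero, returns immediately (pure readiness probe) */
poll(&pfd, 1, 500);  /* timeout > 0: sleeps for at most 500 ms */
poll(&pfd, 1, -1);   /* negative timeout: end_time is NULL, blocks until an event arrives */

The kernel-side loop that implements this is do_poll():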

static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -ERESTARTNOHAND;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

Application-level example

#include <sys/types.h> 
#include <sys/socket.h> 
#include <stdio.h> 
#include <netinet/in.h> 
#include <sys/time.h> 
#include <sys/ioctl.h> 
#include <unistd.h> 
#include <stdlib.h>
 
int main() 
{ 
    int server_sockfd, client_sockfd; 
    int server_len;
    socklen_t client_len;
    struct sockaddr_in server_address; 
    struct sockaddr_in client_address; 
    int result; 
    fd_set readfds, testfds; 
    server_sockfd = socket(AF_INET, SOCK_STREAM, 0); // create the server socket
    server_address.sin_family = AF_INET; 
    server_address.sin_addr.s_addr = htonl(INADDR_ANY); 
    server_address.sin_port = htons(8888); 
    server_len = sizeof(server_address); 
    bind(server_sockfd, (struct sockaddr *)&server_address, server_len); 
    listen(server_sockfd, 5); // the listen backlog holds at most 5 pending connections
    FD_ZERO(&readfds); 
    FD_SET(server_sockfd, &readfds); // add the server socket to the set
    while(1) 
    {
        char ch; 
        int fd; 
        int nread; 
        testfds = readfds; // copy the watched set into the working set handed to select(); select() modifies it, so a separate variable must be used
        printf("server waiting %d\n", FD_SETSIZE); 
 
        /* Block indefinitely, waiting for activity on the file descriptors */
        result = select(FD_SETSIZE, &testfds, (fd_set *)0, (fd_set *)0, (struct timeval *) 0); // FD_SETSIZE: the system's default maximum number of file descriptors
        if(result < 1) 
        { 
            perror("server5"); 
            exit(1); 
        }
 
		printf("select server ready %d\n", result); 
        /* Scan all file descriptors */
        for(fd = 0; fd < FD_SETSIZE; fd++) 
        {
            /* Find the descriptors with activity */
            if(FD_ISSET(fd,&testfds))
            { 
                /* If it is the server socket, a client is requesting a connection */
                if(fd == server_sockfd) 
                { 
                    client_len = sizeof(client_address); 
                    client_sockfd = accept(server_sockfd, 
                    (struct sockaddr *)&client_address, &client_len); 
                    FD_SET(client_sockfd, &readfds); // add the client socket to the set
                    printf("server fd %d adding client on fd %d\n", server_sockfd, client_sockfd); 
                }
                /* Otherwise a client socket has a data request for us */
                else 
                { 
                    ioctl(fd, FIONREAD, &nread); // get the number of bytes available into nread
                    
                    /* Client is done: close the socket and remove its descriptor from the set */
                    if(nread == 0) 
                    { 
                        close(fd); 
                        FD_CLR(fd, &readfds); // drop the closed fd
                        printf("removing client on fd %d\n", fd); 
                    } 
                    /* Handle the client's data request */
                    else 
                    { 
                        read(fd, &ch, 1); 
                        sleep(5); 
                        printf("serving client on fd %d\n", fd);
                        ch++; 
                        write(fd, &ch, 1); 
                    }
                } 
            } 
        } 
    } 
 
    return 0;
}

epoll

epoll_create first creates the epoll handle;

epoll_ctl then associates a new fd with the epoll handle: it copies the fd into the kernel, inserts it into the red-black tree, and registers a callback function.

1. An initial call to ep_item_poll invokes file->f_op->poll to see whether any read/write bits are already set.

2. Whenever the file descriptor changes state, e.g. something is written to it, the callback fires and file->f_op->poll is eventually consulted again to refresh the event bits (see the call chain sketched below).
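Point 2 works roughly as follows: the wait-queue entry that ep_ptable_queue_proc installed on the device's wait queue uses ep_poll_callback as its wake function, so when the driver wakes that queue the ready fd is linked onto epoll's ready list and any task sleeping in epoll_wait is woken. In the same call-chain style as above (dev->r_wait is a placeholder name):

device driver: wake_up_interruptible(&dev->r_wait)
	ep_poll_callback                 /* wake function installed by ep_ptable_queue_proc */
		list_add_tail(&epi->rdllink, &ep->rdllist)  /* put the epitem on the ready list */
		wake_up(&ep->wq)             /* wake the task sleeping in epoll_wait */

The registration path for epoll_ctl(EPOLL_CTL_ADD) is: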

sys_epoll_ctl
    ep_insert
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled();

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* now check if we've created too many backpaths */
	error = -EINVAL;
	if (full_check && reverse_path_check())
		goto error_remove_epi;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);
error_remove_epi:
	spin_lock(&tfile->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events;
	if (!is_file_epoll(epi->ffd.file))
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;

	ep = epi->ffd.file->private_data;
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);

	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}

epoll_wait checks the ready list to see whether any fds are ready.

epoll_wait
	do_epoll_wait
		ep_poll
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation. We still need
		 * lock because we could race and not see an epi being added
		 * to the ready list while in irq callback. Thus incorrectly
		 * returning 0 back to userspace.
		 */
		timed_out = 1;

		write_lock_irq(&ep->lock);
		eavail = ep_events_available(ep);
		write_unlock_irq(&ep->lock);

		goto send_events;
	}

fetch_events:

	if (!ep_events_available(ep))
		ep_busy_loop(ep, timed_out);

	eavail = ep_events_available(ep);
	if (eavail)
		goto send_events;

	/*
	 * Busy poll timed out.  Drop NAPI ID for now, we can add
	 * it back in when we have moved a socket with a valid NAPI
	 * ID onto the ready list.
	 */
	ep_reset_busy_poll_napi_id(ep);

	do {
		/*
		 * Internally init_wait() uses autoremove_wake_function(),
		 * thus wait entry is removed from the wait queue on each
		 * wakeup. Why it is important? In case of several waiters
		 * each new wakeup will hit the next waiter, giving it the
		 * chance to harvest new event. Otherwise wakeup can be
		 * lost. This is also good performance-wise, because on
		 * normal wakeup path no need to call __remove_wait_queue()
		 * explicitly, thus ep->lock is not taken, which halts the
		 * event delivery.
		 */
		init_wait(&wait);
		write_lock_irq(&ep->lock);
		__add_wait_queue_exclusive(&ep->wq, &wait);
		write_unlock_irq(&ep->lock);

		/*
		 * We don't want to sleep if the ep_poll_callback() sends us
		 * a wakeup in between. That's why we set the task state
		 * to TASK_INTERRUPTIBLE before doing the checks.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		/*
		 * Always short-circuit for fatal signals to allow
		 * threads to make a timely exit without the chance of
		 * finding more events available and fetching
		 * repeatedly.
		 */
		if (fatal_signal_pending(current)) {
			res = -EINTR;
			break;
		}

		eavail = ep_events_available(ep);
		if (eavail)
			break;
		if (signal_pending(current)) {
			res = -EINTR;
			break;
		}

		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1;
			break;
		}

		/* We were woken up, thus go and try to harvest some events */
		eavail = 1;

	} while (0);

	__set_current_state(TASK_RUNNING);

	if (!list_empty_careful(&wait.entry)) {
		write_lock_irq(&ep->lock);
		__remove_wait_queue(&ep->wq, &wait);
		write_unlock_irq(&ep->lock);
	}

send_events:
	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

Application-level example

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/epoll.h>
#include <fcntl.h>
#include <errno.h>
 
#define MAX_EVENTS 10
#define MAX_BUFFER_SIZE 1024
 
int main() {
    int server_socket, client_socket;
    struct sockaddr_in server_addr, client_addr;
    socklen_t addr_size;
    char buffer[MAX_BUFFER_SIZE];
    struct epoll_event event, events[MAX_EVENTS];
 
    // Create the socket
    server_socket = socket(AF_INET, SOCK_STREAM, 0);
    if (server_socket < 0) {
        perror("Error creating socket");
        exit(EXIT_FAILURE);
    }
 
    // Set up the server address
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(6868);
    //server_addr.sin_addr.s_addr = INADDR_ANY;
    server_addr.sin_addr.s_addr = inet_addr("127.0.0.1");  // bind to 127.0.0.1
 
    memset(server_addr.sin_zero, '\0', sizeof(server_addr.sin_zero));
 
    // Bind the socket to the address
    if (bind(server_socket, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        perror("Error binding socket");
        exit(EXIT_FAILURE);
    }
 
    // Listen
    if (listen(server_socket, 5) < 0) {
        perror("Error listening");
        exit(EXIT_FAILURE);
    }
 
    // Create the epoll instance
    int epoll_fd = epoll_create1(0);
    if (epoll_fd < 0) {
        perror("Error creating epoll instance");
        exit(EXIT_FAILURE);
    }
 
    // Fill in the epoll_event structure
    event.events = EPOLLIN;
    event.data.fd = server_socket;
 
    // Add the listening socket to the epoll instance
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, server_socket, &event) < 0) {
        perror("Error adding socket to epoll instance");
        exit(EXIT_FAILURE);
    }
 
    while (1) {
        int num_ready = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
        if (num_ready < 0) {
            perror("Error waiting for events");
            exit(EXIT_FAILURE);
        }
 
        for (int i = 0; i < num_ready; i++) {
            if (events[i].data.fd == server_socket) {
                // A new client connection request on the listening socket
                addr_size = sizeof(client_addr);
                client_socket = accept(server_socket, (struct sockaddr*)&client_addr, &addr_size);
 
                // Make client_socket non-blocking
                int flags = fcntl(client_socket, F_GETFL, 0);
                fcntl(client_socket, F_SETFL, flags | O_NONBLOCK);
 
                event.events = EPOLLIN | EPOLLET;
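                /* Note: EPOLLET (edge-triggered) together with the non-blocking socket
                 * means a robust handler should keep calling recv() until it fails with
                 * EAGAIN; otherwise unread data will not generate another event. The
                 * handler below is kept simple for brevity. */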
                event.data.fd = client_socket;
 
                // Add the client socket to the epoll instance
                if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client_socket, &event) < 0) {
                    perror("Error adding client socket to epoll instance");
                    exit(EXIT_FAILURE);
                }
 
                printf("New client connected: %s\n", inet_ntoa(client_addr.sin_addr));
            } else {
                // Handle data sent by a client
                int client_fd = events[i].data.fd;
                memset(buffer, 0, MAX_BUFFER_SIZE);
 
                int num_bytes = recv(client_fd, buffer, MAX_BUFFER_SIZE - 1, 0);
                if (num_bytes < 0) {
                    if (errno == EAGAIN || errno == EWOULDBLOCK)
                        continue;   // nothing left to read on the non-blocking socket
                    perror("Error receiving data");
                    close(client_fd);
                    continue;
                } else if (num_bytes == 0) {
                    // The client closed the connection
                    printf("Client disconnected\n");
                    close(client_fd);
                    continue;
                }
 
                // Process the received data
                printf("Received data from client: %s\n", buffer);
 
                // Echo the data back to the client
                send(client_fd, buffer, num_bytes, 0);
            }
        }
    }
 
    // Close the listening socket and the epoll instance
    close(server_socket);
    close(epoll_fd);
 
    return 0;
}

Summary

(1) The select/poll implementation has to scan the whole fd set itself, over and over, until a device becomes ready, possibly alternating between sleeping and waking several times. epoll_wait also loops and may likewise sleep and wake several times, but when a device becomes ready its callback ep_poll_callback (registered through ep_ptable_queue_proc) puts the ready fd on the ready list and wakes the process sleeping in epoll_wait. So although both sleep and wake repeatedly, select/poll must traverse the entire fd set every time they are "awake", whereas epoll only has to check whether the ready list is empty. This saves a great deal of CPU time and is the performance gain brought by the callback mechanism.

(2) Every call to select/poll has to copy the fd set from user space into the kernel and hang current on each device wait queue once. epoll copies each fd only once (at epoll_ctl time), and hangs current on a wait queue only once as well, at the start of epoll_wait; note that this wait queue is epoll's own internal queue, not a device wait queue. This also saves considerable overhead.
