----------
kernel version : 2.6.32.60
----------
I.aio
Asynchronous I/O帮助用户程序提高CPU和IO设备的利用率和提高程序性能,特别是在高负载的IO操作下。比如各种代理服务器,数据库,流服务器等等。
AIO可以一次性发出大量的read/write调用,并通过通用块层的IO调度来获得更好的性能;用户程序也可以减少过多的同步负载,还可以在业务逻辑中更灵活地进行并发控制和负载均衡。另外,相对于其他实现方式(如用户态多线程后台同步IO、Glibc的AIO实现等),内核AIO也减少了线程的负载和上下文切换。
AIO是横跨整个内核的接口,它把所有的IO(包括本地设备、网络、管道等)以统一的异步接口提供给用户程序,每个子系统都针对该接口实现自己的异步方案,而同步IO(Synchronous IO)只是内核内部的“AIO+Blocking”。
以read为例来看"同步IO在内核实现为AIO+Blocking"
fs/read_write.c
/*
 * do_sync_read - perform a blocking read through the file's aio_read method.
 * @filp: file to read from
 * @buf:  user-space destination buffer
 * @len:  number of bytes requested
 * @ppos: in/out file position; updated from the request on return
 *
 * Wraps the buffer in a single-segment iovec and an on-stack kiocb, then
 * drives ->aio_read() to completion.  Returns whatever ->aio_read() (or,
 * for a queued request, wait_on_sync_kiocb()) reports.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec vec = { .iov_base = buf, .iov_len = len };
	struct kiocb req;
	ssize_t result;

	init_sync_kiocb(&req, filp);
	req.ki_pos = *ppos;
	req.ki_left = len;

	/*
	 * Keep re-issuing the request while the method answers -EIOCBRETRY;
	 * req.ki_pos is re-read on every attempt.
	 */
	while ((result = filp->f_op->aio_read(&req, &vec, 1, req.ki_pos)) == -EIOCBRETRY)
		wait_on_retry_sync_kiocb(&req);

	/* The request was queued: its final result comes from the wait. */
	if (result == -EIOCBQUEUED)
		result = wait_on_sync_kiocb(&req);

	*ppos = req.ki_pos;
	return result;
}
EXPORT_SYMBOL(do_sync_read);
/*
 * vfs_read - validate a read request and dispatch it to the file's methods.
 * @file:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   file position, passed through to the read method
 *
 * Returns -EBADF if the file is not open for reading, -EINVAL if it has
 * neither a ->read nor an ->aio_read method, -EFAULT if @buf fails
 * access_ok(), a negative value from rw_verify_area(), or otherwise the
 * result of the read method itself.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || !(file->f_op->read || file->f_op->aio_read))
		return -EINVAL;
	/* A read stores into the user buffer, hence VERIFY_WRITE. */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	nread = rw_verify_area(READ, file, pos, count);
	if (nread < 0)
		return nread;

	/* A non-negative result from rw_verify_area() replaces the count. */
	count = nread;

	/* Prefer the synchronous ->read; otherwise go through ->aio_read. */
	nread = file->f_op->read ? file->f_op->read(file, buf, count, pos)
				 : do_sync_read(file, buf, count, pos);

	if (nread > 0) {
		fsnotify_access(file->f_path.dentry);
		add_rchar(current, nread);
	}
	inc_syscr(current);

	return nread;
}
EXPORT_SYMBOL(vfs_read);
由以上代码可以看出,如果文件的f_op提供了同步read方法,则直接使用该同步read方法;否则使用do_sync_read,以“AIO+Blocking”的方式调用aio_read
II.aio结构
aio的主要流程:
io_setup创建aio上下文
io_submit提交aio操作,并将aio上下文放入到work_queue中异步完成提交的aio操作
work_queue完成aio操作后将完成事件放入环形缓冲区(Ring Buffer)中,供io_getevents读取
III.io_setup
io_setup主要用于创建aio上下文:
1.初始化kioctx结构
2.分配Ring Buffer,用于存储完成AIO操作的事件
/* sys_io_setup:
 *	Create an aio_context able to receive at least nr_events events.
 *	On entry *ctxp must be 0 and must not refer to an existing
 *	aio_context; on success the handle of the new context is stored
 *	in *ctxp and 0 is returned.
 *
 *	Errors:
 *	  -EINVAL  *ctxp was not initialized to 0, nr_events is 0, or
 *		   nr_events exceeds internal limits.
 *	  -EAGAIN  nr_events exceeds the user's limit of available events.
 *	  -ENOMEM  insufficient kernel resources are available.
 *	  -EFAULT  ctxp is an invalid pointer.
 *	  -ENOSYS  not implemented.
 */
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{
	struct kioctx *ioctx;
	unsigned long cur_handle;
	long err;

	err = get_user(cur_handle, ctxp);
	if (unlikely(err))
		return err;

	/* The caller's handle slot must start out zeroed. */
	if (unlikely(cur_handle || nr_events == 0)) {
		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
			 cur_handle, nr_events);
		return -EINVAL;
	}

	ioctx = ioctx_alloc(nr_events);
	if (IS_ERR(ioctx))
		return PTR_ERR(ioctx);

	err = put_user(ioctx->user_id, ctxp);
	if (err) {
		/*
		 * The handle could not be handed back to user space, so
		 * tear the context down again.  io_destroy() expects its
		 * caller to hold a reference, hence the get_ioctx().
		 */
		get_ioctx(ioctx);
		io_destroy(ioctx);
	}

	return err;
}
根据事件数创建aio上下文,并将user_id返回给用户空间,以后根据user_id使用该aio上下文
/* ioctx_alloc
* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.