liburing
版本
系统内核版本:
Linux 5.15.0-107-generic #117~20.04.1-Ubuntu x86_64 GNU/Linux
源代码版本:
git@github.com:torvalds/linux.git v5.15
阅读入口
从最简单的代码看起,即阅读入口:examples/io_uring-test.c
此源文件内部调用liburing相关函数包含
io_uring_queue_init
、io_uring_get_sqe
、io_uring_prep_readv
、io_uring_submit
、io_uring_wait_cqe
、io_uring_cqe_seen
、io_uring_queue_exit
内核提供的系统调用仅有三个(@后为系统调用号):io_uring_setup@425
、io_uring_enter@426
、io_uring_register@427
其中io_uring_queue_init
调用的内核函数为io_uring_setup
以此为起点,阅读内核代码。
liburing调用栈
// Calling code from the example program.
struct io_uring_params p;
memset(&p, 0, sizeof(p));
// Explicitly request no setup flags (the memset above already zeroed the struct).
p.flags = 0;
struct io_uring ring;
// The first argument is the requested number of entries.
// NOTE(review): the return value is ignored in this excerpt.
io_uring_queue_init_params(64, &ring, &p);
/*
 * Set up an io_uring instance using caller-supplied parameters.
 *
 * Forwards to io_uring_queue_init_try_nosqarr() without a caller-provided
 * buffer (NULL, size 0). Any non-negative result from the helper is
 * collapsed to 0; a negative result is returned unchanged.
 */
int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
                               struct io_uring_params *p)
{
    int rc = io_uring_queue_init_try_nosqarr(entries, ring, p, NULL, 0);

    if (rc < 0)
        return rc;
    return 0;
}
/*
 * Try to create the ring with IORING_SETUP_NO_SQARRAY first and, if the
 * setup call answers -EINVAL and the caller did not ask for that flag
 * itself, retry once with the caller's original flags.
 *
 * In this walkthrough the first attempt fails with -EINVAL because the
 * flag is not in the set io_uring_setup() accepts, so the retry is the
 * call that actually does the work.
 *
 * Returns whatever __io_uring_queue_init_params() returns for the attempt
 * that is finally used.
 */
static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
                                           struct io_uring_params *p, void *buf,
                                           size_t buf_size)
{
    const unsigned saved_flags = p->flags;
    int rc;

    /* First attempt: opportunistically add NO_SQARRAY. */
    p->flags = saved_flags | IORING_SETUP_NO_SQARRAY;
    rc = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);

    /* Retry only when the flag we added ourselves is the likely culprit. */
    if (rc == -EINVAL && !(saved_flags & IORING_SETUP_NO_SQARRAY)) {
        p->flags = saved_flags;
        rc = __io_uring_queue_init_params(entries, ring, p, buf, buf_size);
    }

    return rc;
}
/*
 * Abbreviated listing of __io_uring_queue_init_params(): only the part up
 * to the io_uring_setup system call is shown, because that is where the
 * walkthrough enters the kernel.
 *
 * Zeroes *ring, issues the setup call with the requested entry count and
 * parameters, and returns its result (a negative value on failure).
 * buf and buf_size are not used in this abbreviated listing.
 */
int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
                                 struct io_uring_params *p, void *buf,
                                 size_t buf_size)
{
    int fd;

    memset(ring, 0, sizeof(*ring));

    /* Code that this walkthrough never reaches is omitted. */
    fd = __sys_io_uring_setup(entries, p);
    if (fd < 0) {
        return fd;
    }

    return fd;
}
从__sys_io_uring_setup
进入内核代码
// Syscall number 425 resolves to io_uring_setup.
// To locate the definition: grep -rn "SYSCALL_DEFINE" $(find ./ -name '*.c')
/*
 * Kernel entry point for ring creation.
 *
 * Copies the parameter block in from user space, rejects it if any
 * reserved word is non-zero or if a flag outside the accepted set is
 * present, and then hands over to io_uring_create().
 *
 * Returns -EFAULT when the copy fails, -EINVAL for a rejected parameter
 * block, otherwise the result of io_uring_create().
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
    struct io_uring_params kparams;
    int idx;

    /* Bring the caller's parameters into kernel memory. */
    if (copy_from_user(&kparams, params, sizeof(kparams)))
        return -EFAULT;

    /* Every reserved field has to be zero. */
    for (idx = 0; idx < ARRAY_SIZE(kparams.resv); idx++)
        if (kparams.resv[idx])
            return -EINVAL;

    /* Any bit outside this list is refused. */
    if (kparams.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
                          IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
                          IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
                          IORING_SETUP_R_DISABLED))
        return -EINVAL;

    return io_uring_create(entries, &kparams, params);
}
由于io_uring_create
篇幅过长,下面的代码裁剪掉了本例执行不到的分支
/*
 * Create an io_uring instance (trimmed excerpt: branches the example never
 * takes were removed by the author of these notes).
 *
 * Sizes the SQ/CQ, allocates the context and the ring memory, publishes the
 * field offsets and feature bits back to user space, and finally installs a
 * file descriptor for the ring.
 *
 * entries: requested submission-queue size
 * p:       kernel-space copy of the parameters (filled in here)
 * params:  the caller's user-space parameter block (written via copy_to_user)
 *
 * Returns the value of io_uring_install_fd() on success, or a negative
 * errno value on failure.
 *
 * NOTE(review): io_uring_setup() declares its parameter pointer __user; the
 * annotation is missing on params in this excerpt.
 * NOTE(review): nothing in this excerpt rejects entries == 0 before
 * roundup_pow_of_two() - confirm against the upstream source.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params *params)
{
// p refers to data in kernel space
// params refers to data in user space
struct io_ring_ctx *ctx;
struct file *file;
int ret;
// Check whether the requested SQ size exceeds the limit IORING_MAX_ENTRIES
// (0x8000 per the author's notes); clamp only if IORING_SETUP_CLAMP is set.
if (entries > IORING_MAX_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
entries = IORING_MAX_ENTRIES;
}
// Make the queue capacity a power of two
p->sq_entries = roundup_pow_of_two(entries);
// The completion queue (CQ) has twice the capacity of the submission queue
p->cq_entries = 2 * p->sq_entries;
// Allocate the io_ring_ctx object
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
ctx->compat = in_compat_syscall();
if (!capable(CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
mmgrab(current->mm);
ctx->mm_account = current->mm;
// Allocate the memory behind ctx->rings and ctx->sq_sqes
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
// Per the author's notes this creates the polling thread when polling was
// requested; the example does not set that flag, so it is not expanded here.
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
io_rsrc_node_switch(ctx, NULL);
// Record the offsets of the ring structure's members.
// User space reads these offsets when it creates its memory mapping, so that
// its own pointers line up one-to-one with the kernel structure's fields
// (see io_uring_setup_ring_pointers in liburing).
memset(&p->sq_off, 0, sizeof(p->sq_off));
/**
 * The members are declared cacheline-aligned, so p->sq_off.tail = 64:
 * although head is only 4 bytes, it occupies a whole cacheline to avoid
 * false sharing.
 * A cacheline is usually 64 bytes but can be 128; it is 64 on the author's PC.
 * Background on false sharing:
 * https://github.com/TonyBeen/study/blob/master/false_sharing/false_sharing.cc
 *
 * struct io_uring {
 * u32 head __attribute__((__aligned__(64)));
 * u32 tail __attribute__((__aligned__(64)));
 * };
 */
p->sq_off.head = offsetof(struct io_rings, sq.head); // 0
p->sq_off.tail = offsetof(struct io_rings, sq.tail); // 64
p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); // 256
p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); // 264
p->sq_off.flags = offsetof(struct io_rings, sq_flags); // 276
p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); // 272
/**
 * The details live in io_allocate_scq_urings.
 * How the value 2368 is obtained (the computation is in rings_size):
 *
 * sizeof(struct io_rings) = 320 (after 64-byte alignment)
 * sizeof(struct io_uring_cqe) = 16
 * cq_entries = 128
 *
 * p->sq_off.array = cq_entries * sizeof(struct io_uring_cqe) + sizeof(struct io_rings)
 * That sum already falls on a cacheline boundary here; when it does not,
 * the result will not be 2368.
 *
 */
p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; // 2368
memset(&p->cq_off, 0, sizeof(p->cq_off));
p->cq_off.head = offsetof(struct io_rings, cq.head);
p->cq_off.tail = offsetof(struct io_rings, cq.tail);
p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS;
// Copy the filled-in parameters back to user space
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
goto err;
}
// What follows creates an anonymous inode and returns a file descriptor for it.
// The descriptor is visible via `ll /proc/{pid}/fd/`, e.g. 3 -> 'anon_inode:[io_uring]'.
// User space mmaps this descriptor to gain access to the SQ/CQ memory.
file = io_uring_get_file(ctx);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto err;
}
/*
 * Install ring fd as the very last thing, so we don't risk someone
 * having closed it before we finish setup
 */
ret = io_uring_install_fd(ctx, file);
if (ret < 0) {
/* fput will clean it up */
fput(file);
return ret;
}
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
// Common failure path once ctx exists: tear the context down.
io_ring_ctx_wait_and_kill(ctx);
return ret;
}
/*
 * Allocate the two memory regions backing a ring: the shared io_rings area
 * (ring header, CQEs and the SQ index array in one contiguous block) and
 * the separate array of SQEs.
 *
 * On success ctx->rings, ctx->sq_array and ctx->sq_sqes are set and the
 * mask/entry-count fields of the shared area are initialised.
 *
 * Returns 0 on success, -EOVERFLOW if a size computation saturates to
 * SIZE_MAX, or -ENOMEM if an allocation fails. If the SQE step fails the
 * rings area is freed again and ctx->rings is reset to NULL.
 */
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                                  struct io_uring_params *p)
{
    struct io_rings *shared;
    size_t ring_bytes, sqe_bytes, array_off;
    int err;

    /* make sure these are sane, as we already accounted them */
    ctx->sq_entries = p->sq_entries;
    ctx->cq_entries = p->cq_entries;

    /*
     * rings_size() yields the size of one contiguous region holding
     * struct io_rings + the CQE array + sq_array, and reports where
     * sq_array starts. sq_array is the index array into ctx->sq_sqes.
     * For the walkthrough's ring (64 SQ / 128 CQ entries), per the
     * author's notes:
     *   sq_array bytes = sq_entries * sizeof(u32) = 256
     *   array_off      = 2368
     *   ring_bytes     = 2368 + 256 = 2624
     */
    ring_bytes = rings_size(p->sq_entries, p->cq_entries, &array_off);
    if (ring_bytes == SIZE_MAX)
        return -EOVERFLOW;

    shared = io_mem_alloc(ring_bytes);
    if (!shared)
        return -ENOMEM;

    ctx->rings = shared;
    ctx->sq_array = (u32 *)((char *)shared + array_off);
    shared->sq_ring_mask = p->sq_entries - 1;
    shared->cq_ring_mask = p->cq_entries - 1;
    shared->sq_ring_entries = p->sq_entries;
    shared->cq_ring_entries = p->cq_entries;

    sqe_bytes = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
    if (sqe_bytes == SIZE_MAX) {
        err = -EOVERFLOW;
        goto free_rings;
    }

    ctx->sq_sqes = io_mem_alloc(sqe_bytes);
    if (!ctx->sq_sqes) {
        err = -ENOMEM;
        goto free_rings;
    }

    return 0;

free_rings:
    /* Undo the first allocation so the context holds no stale rings pointer. */
    io_mem_free(ctx->rings);
    ctx->rings = NULL;
    return err;
}
返回用户空间
/*
 * Second listing of __io_uring_queue_init_params(): what happens after the
 * setup system call has returned to user space.
 *
 * Zeroes *ring, performs the setup call, and - unless the caller asked for
 * IORING_SETUP_NO_MMAP - maps the ring memory via io_uring_queue_mmap().
 * If the mapping step fails, the descriptor obtained from the setup call
 * is closed before the error is returned.
 *
 * Returns a negative value from the setup call on failure, the result of
 * io_uring_queue_mmap() when mapping is attempted, or 0 when it is skipped.
 * buf and buf_size are not used in this trimmed listing.
 */
int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
                                 struct io_uring_params *p, void *buf,
                                 size_t buf_size)
{
    int ring_fd;
    int rc;

    memset(ring, 0, sizeof(*ring));

    /* Code that this walkthrough never reaches is omitted. */
    ring_fd = __sys_io_uring_setup(entries, p);
    if (ring_fd < 0)
        return ring_fd;

    /* ------> execution continues here after the kernel returns <------ */

    /* With IORING_SETUP_NO_MMAP set there is nothing to map. */
    if (p->flags & IORING_SETUP_NO_MMAP)
        return 0;

    /* io_uring_queue_mmap() is a pass-through; see io_uring_mmap(). */
    rc = io_uring_queue_mmap(ring_fd, p, ring);
    if (rc)
        __sys_close(ring_fd);

    return rc;
}
/*
 * Map the kernel's ring memory for descriptor fd into user space and wire
 * up the SQ/CQ bookkeeping in *sq and *cq.
 *
 * fd: descriptor of the ring
 * p:  parameters filled in by the setup call (offsets, entry counts, features)
 * sq: receives ring_sz, ring_ptr and sqes for the submission side
 * cq: receives ring_sz and, when IORING_FEAT_SINGLE_MMAP is advertised,
 *     the same ring_ptr as sq
 *
 * Returns 0 on success or the error encoded in the failed mapping.
 *
 * NOTE(review): this trimmed listing only sets cq->ring_ptr when
 * IORING_FEAT_SINGLE_MMAP is advertised; the separate CQ mapping used
 * otherwise is not shown here.
 */
static int io_uring_mmap(int fd, struct io_uring_params *p,
                         struct io_uring_sq *sq, struct io_uring_cq *cq)
{
    size_t size;
    int ret;

    size = sizeof(struct io_uring_cqe); // 16

    /**
     * Values observed by the author in gdb for the example ring:
     * p->sq_off.array = 2368
     * p->sq_entries = 64
     *
     * p->cq_off.cqes = 320
     * p->cq_entries = 128
     *
     * which gives
     * sq->ring_sz = 2624
     * cq->ring_sz = 2368
     */
    sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
    cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;

    // io_uring_create() in the kernel (fs/io_uring.c) sets
    // IORING_FEAT_SINGLE_MMAP, so both rings share the larger of the two sizes.
    if (p->features & IORING_FEAT_SINGLE_MMAP) {
        if (cq->ring_sz > sq->ring_sz)
            sq->ring_sz = cq->ring_sz;
        cq->ring_sz = sq->ring_sz;
    }
    /**
     * For the example ring at this point:
     * sq->ring_sz = 2624
     * cq->ring_sz = 2624
     */

    // Map the memory that backs the kernel's struct io_rings.
    sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
                              MAP_SHARED | MAP_POPULATE, fd,
                              IORING_OFF_SQ_RING);
    if (IS_ERR(sq->ring_ptr))
        return PTR_ERR(sq->ring_ptr);

    // With a single mapping, both queues use the same memory.
    if (p->features & IORING_FEAT_SINGLE_MMAP) {
        cq->ring_ptr = sq->ring_ptr;
    }

    size = sizeof(struct io_uring_sqe); // 64

    // Map the array of submission queue entries. These are the structures the
    // application fills in: it obtains one with io_uring_get_sqe, fills it with
    // e.g. io_uring_prep_read, and hands it over with io_uring_submit.
    // NOTE: memory passed to io_uring_prep_read must stay valid until the
    // request has completed.
    sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
    if (IS_ERR(sq->sqes)) {
        // Report the failure instead of returning 0 with an error pointer
        // stored in sq->sqes.
        // NOTE(review): the ring mapping created above is not unmapped on
        // this path because no unmap helper is visible in this excerpt.
        ret = PTR_ERR(sq->sqes);
        return ret;
    }

    // Initialise the submission and completion queue pointers.
    io_uring_setup_ring_pointers(p, sq, cq);
    return 0;
}