20.10轮询模式分析以及struct nvme_bdev_poll_group的具体作用
bdev初始化的过程中在nvme子系统:
spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
bdev_nvme_poll_group_destroy_cb,
sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups");
说明这个group是从属于g_nvme_bdev_ctrlrs这样一个iodevice的。
目前在spdk中存在多个group:bdev_aio_group_channel、nvme_bdev_poll_group、bdev_rbd_group_channel、bdev_uring_group_channel
这个group把不同类型的块设备进行了一个分类。
目前我们更关心的是nvme_bdev_poll_group:
它的定义:
struct nvme_bdev_poll_group {
struct spdk_nvme_poll_group *group;
struct spdk_poller *poller;
bool collect_spin_stat;
uint64_t spin_ticks;
uint64_t start_ticks;
uint64_t end_ticks;
#ifdef SPDK_CONFIG_APP_RW
uint64_t save_start_ticks;
uint64_t num_poll_timeout;
#endif
};
poller专门用来poll这个group,它是在bdev_nvme_poll_group_create_cb中注册的;我们使用的是非时延的poller
bdev_nvme_create_cb调用pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);来从注册的io_device中创建属于线程的spdk_io_channel
每个io_device在每个spdk的thread中只可以有一个spdk_io_channel,第一次如果没有,则创建一个,ch->dev = dev,通过dev判断是否已经创建。
复用thread内部的spdk_io_channel的话,ch->ref++,直接返回该ch
ch创建过程中ch = calloc(1, sizeof(*ch) + dev->ctx_size); 其中dev->ctx_size就是注册时传入的下层channel上下文的大小,不同的io_device注册时会传入不同的下层channel结构
在bdev层和nvme层注册的io_device有:
spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
bdev_mgmt_channel_destroy,
sizeof(struct spdk_bdev_mgmt_channel),
"bdev_mgr");
注册了一个类型为g_bdev_mgr,名为bdev_mgr的通用块层管理io_device,用于管理bdev设备
spdk_io_device_register(__bdev_to_io_dev(bdev),
bdev_channel_create, bdev_channel_destroy,
sizeof(struct spdk_bdev_channel),
bdev_name);
注册了一个类型为__bdev_to_io_dev(bdev),名为bdev_name的spdk_bdev_channel,用于下发读写等io
spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
sizeof(struct nvme_io_channel),
name);
注册了一个名为name的,为nvme_bdev_ctrlr类型的 io_device,用于nvme层的io操作
-------------------------------------------------------------------------------------------------------
spdk_io_channel的结构:
当调用spdk_get_io_channel时会创建并返回一个spdk_io_channel,它是属于每个spdk_thread内部的资源,外部通过下面的接口访问:
spdk_get_io_channel(void *io_device) 入参为io_device
1.遍历g_io_devices,如果没有找到直接退出
2.thread = _get_thread(); 必须为spdk_thread,且thread的状态不能是exit,否则退出
3.TAILQ_FOREACH(ch, &thread->io_channels, tailq) { 先在本线程的io_channels中查找,如果找到,直接复用,ref++
4.创建一个新的ch = calloc(1, sizeof(*ch) + dev->ctx_size); ctx_size为注册时传入
5.rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch)); 创建下属channel
void
spdk_io_device_register(void *io_device, spdk_io_channel_create_cb create_cb,
spdk_io_channel_destroy_cb destroy_cb, uint32_t ctx_size,
const char *name)
入参为:io_device、create_cb、destroy_cb、ctx_size、name
1.thread = spdk_get_thread(); 必须在spdk_thread环境下执行
2.TAILQ_FOREACH(tmp, &g_io_devices, tailq) { 在g_io_devices中搜索,可以复用的设备不用重新register,防止重复
3.dev = calloc(1, sizeof(struct io_device)); 创建一个dev,赋值io_device、create_cb、destroy_cb等关键信息
4.TAILQ_INSERT_TAIL(&g_io_devices, dev, tailq); 插入dev到g_io_devices中进行保存
void
spdk_put_io_channel(struct spdk_io_channel *ch) 入参为spdk_io_channel
1.thread = spdk_get_thread(); 必须在spdk_thread环境下执行
2.if (ch->thread != thread) { 必须让同线程处理
3.ch->ref--; 引用减一
4.if (ch->ref == 0) {
rc = spdk_thread_send_msg(thread, put_io_channel, ch); 如果引用数为0,则向本线程发送消息,由put_io_channel完成销毁(不是在当前调用中立即销毁)
----------------------------------------------------------------------------------------
主要的调用和实现流程为:
1.int spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 打开块设备,desc = calloc(1, sizeof(*desc));分配了一个desc
void *event_ctx, struct spdk_bdev_desc **_desc)
2.spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 入参为desc,创建io_channel资源
spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 传入类型为 __bdev_to_io_dev(bdev)
ch = calloc(1, sizeof(*ch) + dev->ctx_size);
TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); ch插入thread->io_channels
rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch));
bdev_channel_create
ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
bdev_nvme_get_io_channel(void *ctx)
struct nvme_bdev *nvme_bdev = ctx;
return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 传入类型为struct nvme_bdev_ctrlr
ch = calloc(1, sizeof(*ch) + dev->ctx_size);
TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); ch插入thread->io_channels
rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch));
bdev_nvme_create_cb
ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts)); 分配qpair
pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 传入了g_nvme_bdev_ctrlrs
ch = calloc(
TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); pg_ch插入thread->io_channels
rc = dev->create_cb(
bdev_nvme_poll_group_create_cb
struct nvme_bdev_poll_group *group = ctx_buf; 从ctx_buf取出预先创建的nvme_bdev_poll_group
group->group = spdk_nvme_poll_group_create(group);
struct spdk_nvme_poll_group *group; 创建了spdk_nvme_poll_group
group = calloc(1, sizeof(*group));
group->ctx = ctx; 把nvme_bdev_poll_group传入ctx
STAILQ_INIT(&group->tgroups); 初始化一个spdk_nvme_transport_poll_group的队列
group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 注册poller,bdev_nvme_poll为fn
if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) 做qpair->poll_group = tgroup; qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs;
ch->group = spdk_io_channel_get_ctx(pg_ch); 把group传入到nvme_io_channel中存放
rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
rc = transport->ops.ctrlr_connect_qpair(ctrlr, qpair);
nvme_pcie_ctrlr_connect_qpair
_nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); 创建qpair的cq和sq
nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); qpair状态置为NVME_QPAIR_CONNECTED
rc = nvme_poll_group_connect_qpair(qpair);
rc = tgroup->transport->ops.poll_group_connect_qpair(qpair);
qpair->poll_group_tailq_head = &tgroup->connected_qpairs;
STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq);
STAILQ_INSERT_TAIL(&tgroup->connected_qpairs, qpair, poll_group_stailq);
TAILQ_INIT(&ch->pending_resets); 初始化reset队列
mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
ch = calloc(1, sizeof(*ch) + dev->ctx_size);
TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); ch插入thread->io_channels
rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch));
bdev_mgmt_channel_create 创建mgmt_ch,做TAILQ_INIT(&ch->shared_resources);等...初始化队列资源
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
块层初始化以及子系统初始化:
spdk_subsystem_init
subsystem_sort 遍历g_subsystems_deps,如果没有找到则加进去,目标是在初始化一个subsystem时,必须先初始化它依赖的subsystem
目前bdev依赖的有:accel、vmd、sock
spdk_subsystem_init_next
g_next_subsystem->init() 一个个初始化subsystem,直接跳到bdev,这是一个递归调用
bdev_subsystem_initialize
spdk_bdev_initialize(bdev_initialize_complete, NULL);
g_bdev_mgr.bdev_io_pool = spdk_mempool_create 创建3个pool
g_bdev_mgr.buf_small_pool =
g_bdev_mgr.buf_large_pool =
g_bdev_mgr.zero_buffer = 创建一个buffer
spdk_io_device_register(&g_bdev_mgr 注册g_bdev_mgr为io_device,用于管理,传入的ctx_size为sizeof(struct spdk_bdev_mgmt_channel)
rc = bdev_modules_init(); 初始化下属module
rc = module->module_init(); bdev下属module有:aio、iscsi、malloc、nvme、ocssd、uring等... 大概接近30个
bdev_nvme_library_init 只关心nvme
g_bdev_nvme_init_thread = spdk_get_thread(); 这里读取初始化的线程指针
spdk_io_device_register(&g_nvme_bdev_ctrlrs, 注册g_nvme_bdev_ctrlrs类型的io_device,传入的ctx_size为sizeof(struct nvme_bdev_poll_group)
bdev_module_action_complete();
----------------------------------------------------------------------------------------------------------------
块设备的资源分配和注册:
spdk_bdev_register 函数完成对块设备的注册
rc = spdk_bdev_register(&bdev->disk);
int rc = bdev_init(bdev);
spdk_bdev_get_by_name(bdev->name) 检查bdev是否已经被注册,如果已经存在,return -EEXIST;
bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 给bdev生成一个格式化的name,bdev_%s
bdev_start(bdev);
在ctrlr的init过程完毕后,ctrlr->state == NVME_CTRLR_STATE_READY 执行的attach流程
probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
attach_cb
prchk_flags = ctx->prchk_flags[i];
name = strdup(ctx->names[i]); 在nvme_probe_ctx不为空的情况下,进行name和prchk_flags传入
nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags); 开始创建nvme_bdev_ctrlr
nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 支持创建多个namespace,这里分配的是指针数组(每个元素指向一个struct nvme_bdev_ns,后面再逐个分配)
trid_entry = calloc(1, sizeof(*trid_entry)); 创建了一个struct nvme_bdev_ctrlr_trid
trid_entry->trid = *trid;
nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 为每一个namespace分配资源
nvme_bdev_ctrlr->thread = spdk_get_thread(); nvme_bdev_ctrlr属于这个thread
nvme_bdev_ctrlr->ctrlr = ctrlr; 关联到spdk_nvme_ctrlr
nvme_bdev_ctrlr->name = strdup(name);
nvme_bdev_ctrlr->prchk_flags = prchk_flags; 传入name和prchk_flags
spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 注册io_device
sizeof(struct nvme_io_channel),
name);
TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 用g_nvme_bdev_ctrlrs来管理nvme_bdev_ctrlr
nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
nvme_ctrlr_populate_standard_namespace
bdev = calloc(1, sizeof(*bdev));
bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
bdev->disk.ctxt = bdev;
rc = spdk_bdev_register(&bdev->disk);
int rc = bdev_init(bdev);
spdk_bdev_get_by_name(bdev->name) 检查bdev是否已经被注册,如果已经存在,return -EEXIST;
bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 给bdev生成一个格式化的name,bdev_%s
bdev->internal.status = SPDK_BDEV_STATUS_READY;
spdk_io_device_register(__bdev_to_io_dev(bdev),
bdev_channel_create, bdev_channel_destroy,
sizeof(struct spdk_bdev_channel),
bdev_name);
bdev_start(bdev);
TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
bdev_examine(bdev);
nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
nvme_ns->ctrlr->ref++;
TAILQ_INSERT_TAIL(&nvme_ns->bdevs, nvme_disk, tailq);
nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
ns->populated = true;
ns->ctrlr->ref++;
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spdk-20.10 io_channel 和 轮询 group的机制分析
最新推荐文章于 2025-03-07 16:10:18 发布