Ceph 中的 BlueStore 在 user space 直接提供 block device,绕过(bypass)本地文件系统,由 Ceph 自己管理各种存储设备。
ceph中的块设备都是blockdevice的子类
// Factory: instantiate the BlockDevice implementation backing `path`.
// Returns a new PMEMDevice / KernelDevice / NVMEDevice, or nullptr when no
// backend matches.
//
// NOTE(review): the original excerpt used `type` without declaring it; the
// upstream source derives it from the path / configuration before these
// checks.  Default it to "kernel" here so the probing logic is well-formed.
BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
aio_callback_t cb, void *cbpriv)
{
  string type = "kernel";  // TODO confirm: upstream computes this from path/config
#if defined(HAVE_PMEM)
  if (type == "kernel") {
    // Probe whether `path` is really persistent memory: if pmem_map_file
    // succeeds and reports pmem, upgrade the type so PMEMDevice is chosen.
    int is_pmem = 0;
    void *addr = pmem_map_file(path.c_str(), 1024*1024, PMEM_FILE_EXCL, O_RDONLY, NULL, &is_pmem);
    if (addr != NULL) {
      if (is_pmem)
        type = "pmem";
      pmem_unmap(addr, 1024*1024);
    }
  }
#endif
#if defined(HAVE_PMEM)
  if (type == "pmem") {
    return new PMEMDevice(cct, cb, cbpriv);
  }
#endif
  if (type == "kernel") {
    return new KernelDevice(cct, cb, cbpriv);
  }
#if defined(HAVE_SPDK)
  if (type == "ust-nvme") {
    return new NVMEDevice(cct, cb, cbpriv);
  }
#endif
  // Fix: the original excerpt fell off the end of a non-void function
  // (undefined behavior) when no branch matched.
  return nullptr;
}
从这里可以看出 ceph 支持 kernel/pmem/nvme 这三类设备,其中 kernel 这类 block 设备用于 sas 等常用的 hdd
// Open the kernel block device at path `p` twice: once with O_DIRECT for the
// data path and once buffered (page cache).  Returns 0 on success, -errno on
// the first failed ::open.
// NOTE(review): this excerpt is truncated -- the upstream function continues
// (block-size discovery, aio start, cleanup labels such as `out_direct`) and
// the closing brace is not shown here.
int KernelDevice::open(const string& p)
{
path = p;
int r = 0;
dout(1) << __func__ << " path " << path << dendl;
// Open the device with direct I/O (O_RDWR | O_DIRECT) for the main data path.
fd_direct = ::open(path.c_str(), O_RDWR | O_DIRECT);
if (fd_direct < 0) {
r = -errno;
derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
return r;
}
// Open the same device again with buffered I/O, used when callers request
// `buffered` writes.
fd_buffered = ::open(path.c_str(), O_RDWR);
if (fd_buffered < 0) {
r = -errno;
derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
goto out_direct;  // `out_direct` cleanup label is in the elided remainder
}
这里以aio_write为例
int KernelDevice::aio_write(
uint64_t off,
bufferlist &bl,
IOContext *ioc,
bool buffered)
{
#ifdef HAVE_LIBAIO
if (aio && dio && !buffered) {
ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
++ioc->num_pending;
#要写入的aio
aio_t& aio = ioc->pending_aios.back();
aio.bl.claim_append(bl);
写入buffer中
aio.pwritev(off, len);
}
} else
#endif
{
#如果没有定义libaio的宏,则通过_sync_write ->pwritev的方式来写入
int r = _sync_write(off, bl, buffered);
_aio_log_finish(ioc, off, len);
if (r < 0)
return r;
}
return 0;
}
通过aio_write 写入buffer中后,会调用aio_submit 来提交io请求
// Hand every staged aio on this IOContext to the kernel in one batch.
// NOTE(review): this excerpt is abridged -- `e` (end iterator over
// running_aios) and `pending` (number of aios to submit) are computed in code
// the article elided; verify against the upstream source.
void KernelDevice::aio_submit(IOContext *ioc)
{
void *priv = static_cast<void*>(ioc);
int r, retries = 0;
// Submit the staged aios as one batch.
r = aio_queue.submit_batch(ioc->running_aios.begin(), e,
pending, priv, &retries);
}
// Submit the aios in [begin, end) to the kernel via io_submit(2), looping
// until all of them have been accepted.
// NOTE(review): heavily abridged excerpt -- `left` (aios remaining) and
// `piocb` (iocb pointer array built from the iterator range) are declared in
// elided code, the loop body that advances done/left and the EAGAIN retry
// path are elided, and the function is not closed here.
int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
uint16_t aios_size, void *priv,
int *retries)
{
assert(aios_size >= left);
int done = 0;
while (left > 0) {
// Batch-submit the remaining iocbs in a while loop; io_submit may accept
// fewer than `left` entries per call.
int r = io_submit(ctx, left, piocb + done);
if (r < 0) {
}
}
aio 提交后,在 KernelDevice 的构造函数中新建的一个 thread 会专门检查 io 的完成情况
// KernelDevice constructor (abridged excerpt): shows only the members
// relevant to the article -- the aio completion thread and the
// crash-injection counter.  Other member initializers are elided.
KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
: aio_thread(this), // thread that waits for aio completions
injecting_crash(0)
{
}
这个等待aio完成的thread的实现如下:
// Poll loop run by aio_thread: reap completed aios until aio_stop is set.
// NOTE(review): truncated excerpt -- the per-event completion handling and
// the closing braces of the loop/function are elided in the original article.
void KernelDevice::_aio_thread()
{
while (!aio_stop) {
dout(40) << __func__ << " polling" << dendl;
int max = cct->_conf->bdev_aio_reap_max;
// NOTE(review): variable-length array -- a GCC extension, not standard C++.
aio_t *aio[max];
// Wait for completed aios via the libaio-backed helper (io_getevents).
int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms,
aio, max);
if (r < 0) {
derr << __func__ << " got " << cpp_strerror(r) << dendl;
assert(0 == "got unexpected error from io_getevents");
}
}
具体的等待函数如下:可以看出是通过io_getevents 来完成等待的
int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
{
int r = 0;
do {
r = io_getevents(ctx, 1, max, event, &t);
} while (r == -EINTR);
}
bluestore用到的块设备
最新推荐文章于 2024-01-09 17:07:31 发布