ceph version: Kraken
ObjectStore 获取文件系统的 fsid。OSD 在用户态又构造了一层自己的文件系统来管理数据,并为其分配了唯一标识 UUID。该 UUID 是该文件系统元信息的一部分;底层使用的存储后端不同,其保存位置也不同:BlueStore 保存在块设备的第一个块中,FileStore 保存在日志设备的第一个块中。
获取fsid方法:
/**
 * Work out which object store backend owns a device and fetch its fsid.
 *
 * BlueStore is probed first (only when built with HAVE_LIBAIO), then
 * FileStore, whose fsid is read from the journal header.
 *
 * @param cct   Ceph context used for logging and passed to the backends
 * @param path  path of the device (or journal) to probe
 * @param fsid  [out] receives the uuid reported by the matching backend
 * @return 0 if either backend recognised the device, -EINVAL otherwise
 *         (the backends' own error codes are not propagated)
 */
int ObjectStore::probe_block_device_fsid(
  CephContext *cct,
  const string& path,
  uuid_d *fsid)
{
#if defined(HAVE_LIBAIO)
  // BlueStore goes first: its label carries a crc, so a non-bluestore
  // device is rejected reliably.
  if (BlueStore::get_block_device_fsid(cct, path, fsid) == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
                          << *fsid << dendl;
    return 0;
  }
#endif

  // Fall back to FileStore, which reads the fsid from the journal.
  if (FileStore::get_block_device_fsid(cct, path, fsid) == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
                          << *fsid << dendl;
    return 0;
  }

  return -EINVAL;
}
BlueStore 获取 OSD 文件系统的 uuid:该 uuid 保存在 bluestore_bdev_label_t 结构的 osd_uuid 字段中,该结构序列化后存放在块设备的第一个块中。
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
¦ ¦ ¦uuid_d *fsid)
{
bluestore_bdev_label_t label;
int r = _read_bdev_label(cct, path, &label);
if (r < 0)
¦ return r;
*fsid = label.osd_uuid;
return 0;
}
读取第一个block,反序列化得到label
/**
 * Read and validate the BlueStore device label.
 *
 * Reads BDEV_LABEL_BLOCK_SIZE bytes from the start of the device, decodes a
 * bluestore_bdev_label_t from them, and compares the crc32c of the encoded
 * label bytes with the crc stored directly after the label.
 *
 * @param cct    Ceph context (used by the logging macros)
 * @param path   path of the device to open read-only
 * @param label  [out] receives the decoded label
 * @return 0 on success; -errno if the open fails; the negative result of
 *         read_fd() if the read fails; -EINVAL if decoding throws;
 *         -EIO on crc mismatch
 */
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;

  // Open the device.
  int fd = ::open(path.c_str(), O_RDONLY);
  if (fd < 0) {
    const int open_err = -errno;
    derr << __func__ << " failed to open " << path << ": "
         << cpp_strerror(open_err) << dendl;
    return open_err;
  }

  // Pull in the label block; the descriptor is not needed afterwards.
  bufferlist raw;
  const int nread = raw.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (nread < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(nread) << dendl;
    return nread;
  }

  // Decode the label, checksum exactly the bytes the decode consumed, then
  // decode the stored checksum that follows them.
  uint32_t actual_crc, stored_crc;
  bufferlist::iterator it = raw.begin();
  try {
    ::decode(*label, it);
    bufferlist covered;
    covered.substr_of(raw, 0, it.get_off());
    actual_crc = covered.crc32c(-1);
    ::decode(stored_crc, it);
  }
  catch (buffer::error& e) {
    derr << __func__ << " unable to decode label at offset " << it.get_off()
         << ": " << e.what()
         << dendl;
    return -EINVAL;
  }

  if (actual_crc != stored_crc) {
    derr << __func__ << " bad crc on label, expected " << stored_crc
         << " != actual " << actual_crc << dendl;
    return -EIO;
  }

  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
FileStore 获取osd文件系统的OSD uuid
/**
 * Read the fsid stored in a FileStore journal header.
 *
 * @param cct   Ceph context handed to the temporary FileJournal
 * @param path  path of the journal file or device
 * @param fsid  [in,out] passed to the FileJournal constructor and
 *              overwritten by peek_fsid() on success
 * @return the result of FileJournal::peek_fsid()
 */
int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  // The two trailing `false` arguments keep the journal from attempting
  // aio or direct_io (and from logging noisy errors when those fail);
  // performance does not matter for a one-off header read.
  FileJournal probe(cct, *fsid, 0, 0, path.c_str(), false, false);
  return probe.peek_fsid(*fsid);
}
/**
 * Open the journal read-only, read its header and report the header's fsid.
 *
 * This can not be used on an active journal: the journal must not already
 * be open (fd == -1 is asserted).
 *
 * @param fsid  [out] set to header.fsid when the header was read
 * @return the non-zero result of _open() if opening failed; otherwise the
 *         result of read_header() (negative on failure)
 */
int FileJournal::peek_fsid(uuid_d& fsid)
{
  assert(fd == -1);

  int r = _open(false, false);
  if (r)
    return r;

  r = read_header(&header);
  if (r >= 0)
    fsid = header.fsid;

  // The journal is closed again whether or not the header was readable.
  close();
  return r;
}
int FileJournal::_open(bool forwrite, bool create)
{
int flags, ret;
if (forwrite) {
¦ flags = O_RDWR;
¦ if (directio)
¦ ¦ flags |= O_DIRECT | O_DSYNC;
} else {
¦ flags = O_RDONLY;
}
if (create)
¦ flags |= O_CREAT;
if (fd >= 0) {
¦ if (TEMP_FAILURE_RETRY(::close(fd))) {
¦ ¦ int err = errno;
¦ ¦ derr << "FileJournal::_open: error closing old fd: "
¦ ¦<< cpp_strerror(err) << dendl;
¦ }
}
//打开日志设备
fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644));
if (fd < 0) {
¦ int err = errno;
¦ dout(2) << "FileJournal::_open unable to open journal "
¦ ¦ << fn << ": " << cpp_strerror(err) << dendl;
¦ return -err;
}
//获取指定文件的元信息,读取初始化日志文件(或设备)的相关数据(大小,块大小)
struct stat st;
ret = ::fstat(fd, &st);
if (ret) {
¦ ret = errno;
¦ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
¦ ret = -ret;
¦ goto out_fd;
}
//判断是常规文件还是裸块设备
if (S_ISBLK(st.st_mode)) {
¦ ret = _open_block_device();
} else if (S_ISREG(st.st_mode)) {
¦ if (aio && !force_aio) {
¦ ¦ derr << "FileJournal::_open: disabling aio for non-block journal. Use "
¦ ¦<< "journal_force_aio to force use of aio anyway" << dendl;
¦ ¦ aio = false;
¦ }
¦ ret = _open_file(st.st_size, st.st_blksize, create);
} else {
¦ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
¦<< dendl;
¦ ret = -EINVAL;
}
if (ret)
¦ goto out_fd;
//初始化libaio
#ifdef HAVE_LIBAIO
if (aio) {
¦ aio_ctx = 0;
¦ ret = io_setup(128, &aio_ctx);
¦ if (ret < 0) {
¦ ¦ switch (ret) {
// Contrary to naive expectations -EAGIAN means ...
case -EAGAIN:
¦ derr << "FileJournal::_open: user's limit of aio events exceeded. "
¦ ¦ ¦ ¦<< "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
¦ break;
default:
¦ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
¦ break;
¦ ¦ }
¦ ¦ goto out_fd;
¦ }
}
#endif
/* We really want max_size to be a multiple of block_size. */
max_size -= max_size % block_size;
dout(1) << "_open " << fn << " fd " << fd
¦ << ": " << max_size
¦ << " bytes, block size " << block_size
¦ << " bytes, directio = " << directio
¦ << ", aio = " << aio
¦ << dendl;
return 0;
out_fd:
VOID_TEMP_FAILURE_RETRY(::close(fd));
fd = -1;
return ret;
}
打开块设备形式的日志:获取块设备的大小,并检查其是否不小于最小日志大小(ONE_MEG)的要求。
int FileJournal::_open_block_device()
{
int64_t bdev_sz = 0;
int ret = get_block_device_size(fd, &bdev_sz);
if (ret) {
¦ dout(0) << __func__ << ": failed to read block device size." << dendl;
¦ return -EIO;
}
/* Check for bdev_sz too small */
if (bdev_sz < ONE_MEG) {
¦ dout(0) << __func__ << ": your block device must be at least "
¦ ¦ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
¦ return -EINVAL;
}
dout(10) << __func__ << ": ignoring osd journal size. "
¦ ¦<< "We'll use the entire block device (size: " << bdev_sz << ")"
¦ ¦<< dendl;
max_size = bdev_sz;
block_size = cct->_conf->journal_block_size;
if (cct->_conf->journal_discard) {
//获取磁盘对discard的支持(/sys/block/sdb/queue/discard_granularity)
¦ discard = block_device_support_discard(fn.c_str());
¦ dout(10) << fn << " support discard: " << (int)discard << dendl;
}
return 0;
}
/**
 * Query the size, in bytes, of the block device open on fd.
 *
 * Uses the BLKGETSIZE64 ioctl when available; otherwise falls back to
 * BLKGETSIZE, which reports 512-byte sectors.
 *
 * @param fd     open descriptor of the block device
 * @param psize  [out] receives the device size in bytes
 * @return the non-negative ioctl result on success, -errno on failure
 */
int get_block_device_size(int fd, int64_t *psize)
{
#ifdef BLKGETSIZE64
  int ret = ::ioctl(fd, BLKGETSIZE64, psize);
#elif defined(BLKGETSIZE)
  // BLKGETSIZE yields a sector count; convert it to bytes.
  unsigned long sectors = 0;
  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
  *psize = sectors * 512ULL;
#else
// cppcheck-suppress preprocessorErrorDirective
# error "Linux configuration error (get_block_device_size)"
#endif
  if (ret < 0)
    ret = -errno;
  return ret;
}
当 OSD 日志是普通文件(而非块设备)时,使用该方法打开该日志文件。
int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
bool create)
{
int ret;
//配置日志文件的大小
int64_t conf_journal_sz(cct->_conf->osd_journal_size);
conf_journal_sz <<= 20;
if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
¦ derr << "I'm sorry, I don't know how large of a journal to create."
¦<< "Please specify a block device to use as the journal OR "
¦<< "set osd_journal_size in your ceph.conf" << dendl;
¦ return -EINVAL;
}
if (create && (oldsize < conf_journal_sz)) {
¦ uint64_t newsize(conf_journal_sz);
¦ dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl;
//扩展日志文件大小,但是该方法只分配了虚拟的空间,即没有实际的数据块
¦ ret = ::ftruncate(fd, newsize);
¦ if (ret < 0) {
¦ ¦ int err = errno;
¦ ¦ derr << "FileJournal::_open_file : unable to extend journal to "
¦ ¦<< newsize << " bytes: " << cpp_strerror(err) << dendl;
¦ ¦ return -err;
¦ }
#ifdef HAVE_POSIX_FALLOCATE
//为文件分配实际的磁盘空间,以防止磁盘空间不足导致写入失败。
¦ ret = ::posix_fallocate(fd, 0, newsize);
¦ if (ret) {
¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "
¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;
¦ ¦ return -ret;
¦ }
¦ max_size = newsize;
#elif defined(__APPLE__)
¦ fstore_t store;
¦ store.fst_flags = F_ALLOCATECONTIG;
¦ store.fst_posmode = F_PEOFPOSMODE;
¦ store.fst_offset = 0;
¦ store.fst_length = newsize;
//同上
¦ ret = ::fcntl(fd, F_PREALLOCATE, &store);
¦ if (ret == -1) {
¦ ¦ ret = -errno;
¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "
¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;
¦ ¦ return ret;
¦ }
¦ max_size = newsize;
#else
# error "Journal pre-allocation not supported on platform."
#endif
}
else {
¦ max_size = oldsize;
}
block_size = cct->_conf->journal_block_size;
//初始化日志空间,通过填充‘0’
if (create && cct->_conf->journal_zero_on_create) {
¦ derr << "FileJournal::_open_file : zeroing journal" << dendl;
¦ uint64_t write_size = 1 << 20;
¦ char *buf;
//申请一块block_size内存对其的write_size大小的内存空间。
¦ ret = ::posix_memalign((void **)&buf, block_size, write_size);
¦ if (ret != 0) {
¦ ¦ return -ret;
¦ }
¦ memset(static_cast<void*>(buf), 0, write_size);
¦ uint64_t i = 0;
¦ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
¦ ¦ if (ret < 0) {
free(buf);
return -errno;
¦ ¦ }
¦ }
¦ if (i < (uint64_t)max_size) {
¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
¦ ¦ if (ret < 0) {
free(buf);
return -errno;
¦ ¦ }
¦ }
¦ free(buf);
}
dout(10) << "_open journal is not a block device, NOT checking disk "
¦ ¦ ¦ ¦ ¦<< "write cache on '" << fn << "'" << dendl;
return 0;
}
读取日志的头,该头在日志的第一个块中
/**
 * Read and decode the journal header from the first block of the journal.
 *
 * Reads block_size bytes at offset 0 into a page-aligned buffer, zero-fills
 * whatever a short read left unwritten, and decodes a header_t from it.
 *
 * @param hdr  [out] receives the decoded header
 * @return 0 on success, -errno if pread fails, -EINVAL if decoding throws
 */
int FileJournal::read_header(header_t *hdr) const
{
  dout(10) << "read_header" << dendl;

  buffer::ptr bp = buffer::create_page_aligned(block_size);
  char* const base = bp.c_str();
  const int r = ::pread(fd, base, bp.length(), 0);
  if (r < 0) {
    int err = errno;
    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
    return -err;
  }

  // don't use bp.zero() here, because it also invalidates
  // crc cache (which is not yet populated anyway)
  const size_t got = (size_t)r;
  if (got != bp.length()) {
    // pread never returns more than requested, so only the tail beyond
    // the bytes actually read needs clearing.
    memset(base + got, 0, bp.length() - got);
  }

  bufferlist bl;
  bl.push_back(std::move(bp));

  try {
    bufferlist::iterator p = bl.begin();
    ::decode(*hdr, p);
  }
  catch (buffer::error& e) {
    derr << "read_header error decoding journal header" << dendl;
    return -EINVAL;
  }

  /*
   * Unfortunately we weren't initializing the flags field for new
   * journals!  Aie.  This is safe(ish) now that we have only one
   * flag.  Probably around when we add the next flag we need to
   * remove this or else this (eventually old) code will clobber newer
   * code's flags.
   */
  if (hdr->flags > 3) {
    derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
    hdr->flags = 0;
  }

  print_header(*hdr);
  return 0;
}
void FileJournal::print_header(const header_t &header) const
{
dout(10) << "header: block_size " << header.block_size
¦ ¦<< " alignment " << header.alignment
¦ ¦<< " max_size " << header.max_size
¦ ¦<< dendl;
dout(10) << "header: start " << header.start << dendl;
dout(10) << " write_pos " << write_pos << dendl;
}