ObjectStore api
ObjectStore 是 BlueStore 的父类,通过研究 ObjectStore 的文档,可以清楚 BlueStore 对外提供了哪些功能。
大部分功能都是向 OSD 提供,在 /src/osd/OSD.cc 中被调用。
create
创建一个 ObjectStore 实例。只会在初始化时调用一次。
/**
* create - create an ObjectStore instance.
*
* This is invoked once at initialization time.
*
* @param type 存储引擎类型(BlueStore,FileStore)
* @param data OSD 数据路径,如/var/lib/ceph/osd0
* @param journal 日志路径,FileStore 需要提供,BlueStore 不需要提供
* @param flags which filestores should check if applicable,BlueStore 不需要提供
* @return ObjectStore 指针
*/
static ObjectStore *create(CephContext *cct,
const std::string& type,
const std::string& data,
const std::string& journal,
osflagbits_t flags = 0);
使用例子:
auto cct = global_init(...);
common_init_finish(g_ceph_context);
ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
probe_block_device_fsid
返回一个 OSD 的 fsid号。即 /var/lib/ceph/osd/fsid 文件中保存的 uuid 序列号
/**
* probe a block device to learn the uuid of the owning OSD
*
* @param cct cct
* @param path OSD 路径
* @param fsid [out] OSD fsid 号
* @return 0 for success, other for failure
*/
static int probe_block_device_fsid(
CephContext *cct,
const std::string& path,
uuid_d *fsid);
get_cur_stats
获取 ObjectStore 状态。
/**
* Fetch Object Store statistics.
*
* 返回延时和响应时间。write latency and apply times
*
* 此调用不会获取锁,因此在调用期间 OS 的状态可能发生改变,获取到的结果可能已经过时
*
* @param
* @return objectstore_perf_stat_t 实例
*/
virtual objectstore_perf_stat_t get_cur_stats() = 0;
get_perf_counters
获取 OS 中的 perf_counters 对象指针。
/**
* Fetch Object Store performance counters.
*
* This appears to be called with nothing locked.
*
* @param
* @return PerfCounters 指针
*/
virtual const PerfCounters* get_perf_counters() const = 0;
queue_transaction / queue_transactions
提交事务到 OS。
/**
* @param ch 集合句柄,用于获取对应 collection
* @param t 事务,封装一组 op 操作,可以是不同 hobj
* @param op
* @param handle
* @return 0 for success, other for failure
*/
int queue_transaction(CollectionHandle& ch,
                      Transaction&& t,
                      TrackedOpRef op = TrackedOpRef(),
                      ThreadPool::TPHandle *handle = NULL) {
  // Single-transaction convenience wrapper: move the transaction into a
  // one-element batch and hand it to queue_transactions(), returning
  // whatever that call returns.
  std::vector<Transaction> batch;
  batch.emplace_back(std::move(t));
  return queue_transactions(ch, batch, op, handle);
}
virtual int queue_transactions(
CollectionHandle& ch, std::vector<Transaction>& tls,
TrackedOpRef op = TrackedOpRef(),
ThreadPool::TPHandle *handle = NULL) = 0;
upgrade
仅在 FileStore 中使用。
get_db_statistics
打印 OS 的使用情况,可以通过 OSD 使用 dump_objectstore_kv_stats 命令调用。
/**
* @param f 输出流
* @return
*/
virtual void get_db_statistics(ceph::Formatter *f) { }
本人在集群中使用以下命令查看,发现返回结果为空
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok dump_objectstore_kv_stats
generate_db_histogram
kvdb 统计
/**
* @param f 输出流
* @return
*/
virtual void generate_db_histogram(ceph::Formatter *f) { }
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok calc_objectstore_db_histogram
{
"num_onodes": 660,
"num_shards": 48,
"num_super": 8,
"num_coll": 193,
"num_omap": 15,
"num_pgmeta_omap": 2783,
"num_deferred": 2,
"num_alloc": 58,
"num_stat": 3,
"num_shared_shards": 0,
"num_others": 0,
"max_key_size": 79,
"max_value_size": 41118,
"total_key_size": 128176,
"total_value_size": 674498
}
{
"[0,64)": 1440,
"[64,128)": 356,
"[128,192)": 1630,
"[320,384)": 2,
"[384,448)": 5,
"[448,512)": 7,
"[512,576)": 128,
"[576,640)": 1,
"[640,704)": 1,
"[832,896)": 1,
"[896,960)": 193,
"[1088,1152)": 2,
"[1152,1216)": 1,
"[1408,1472)": 1,
"[16448,16512)": 1,
"[41088,41152)": 1
}
{
"prefix": "B",
"key_hist": {
"[0,32)": 58,
"max_len": 17,
"value_hist": {
"[0,64)": 58,
"max_len": 16
}
},
"prefix": "C",
"key_hist": {
"[0,32)": 193,
"max_len": 11,
"value_hist": {
"[0,64)": 193,
"max_len": 10
}
},
"prefix": "L",
"key_hist": {
"[0,32)": 2,
"max_len": 10,
"value_hist": {
"[16448,16512)": 1,
"max_len": 16474,
"[41088,41152)": 1,
"max_len": 41118
}
},
"prefix": "M",
"key_hist": {
"[0,32)": 14,
"max_len": 27,
"value_hist": {
"[0,64)": 14,
"max_len": 26
},
"[32,64)": 1,
"max_len": 32,
"value_hist": {
"[0,64)": 1,
"max_len": 17
}
},
"prefix": "P",
"key_hist": {
"[0,32)": 1046,
"max_len": 26,
"value_hist": {
"[0,64)": 769,
"max_len": 33,
"[128,192)": 85,
"max_len": 186,
"[896,960)": 192,
"max_len": 948
},
"[32,64)": 1737,
"max_len": 42,
"value_hist": {
"[0,64)": 193,
"max_len": 12,
"[128,192)": 1544,
"max_len": 187
}
},
"prefix": "S",
"key_hist": {
"[0,32)": 8,
"max_len": 26,
"value_hist": {
"[0,64)": 8,
"max_len": 20
}
},
"prefix": "T",
"key_hist": {
"[0,32)": 3,
"max_len": 10,
"value_hist": {
"[0,64)": 3,
"max_len": 40
}
},
"prefix": "o",
"key_hist": {
"[32,64)": 554,
"max_len": 59,
"value_hist": {
"[0,64)": 193,
"max_len": 30,
"[64,128)": 356,
"max_len": 111,
"[320,384)": 1,
"max_len": 357,
"[384,448)": 3,
"max_len": 445,
"[576,640)": 1,
"max_len": 611
},
"[64,96)": 106,
"max_len": 74,
"value_hist": {
"[384,448)": 2,
"max_len": 422,
"[448,512)": 6,
"max_len": 473,
"[512,576)": 94,
"max_len": 517,
"[640,704)": 1,
"max_len": 660,
"[832,896)": 1,
"max_len": 855,
"[896,960)": 1,
"max_len": 939,
"[1408,1472)": 1,
"max_len": 1427
}
},
"prefix": "x",
"key_hist": {
"[64,96)": 48,
"max_len": 79,
"value_hist": {
"[0,64)": 8,
"max_len": 9,
"[128,192)": 1,
"max_len": 147,
"[320,384)": 1,
"max_len": 339,
"[448,512)": 1,
"max_len": 483,
"[512,576)": 34,
"max_len": 547,
"[1088,1152)": 2,
"max_len": 1098,
"[1152,1216)": 1,
"max_len": 1166
}
}
}
flush_cache
清空 onode 和 buffer 缓存,因为缓存从磁盘读取,所以也不需要刷新到磁盘,直接清空缓存即可完成动作。
/**
* @param os 输出流
* @return 0 for success, other for failure
*/
virtual int flush_cache(std::ostream *os = NULL) { return -1; }
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok flush_store_cache
dump_perf_counters
/**
* @param f, os 输出流
* @return
*/
virtual void dump_perf_counters(ceph::Formatter *f) {}
virtual void dump_cache_stats(std::ostream& os) {}
dump_cache_stats
/**
* @param f 输出流
* @return
*/
virtual void dump_cache_stats(ceph::Formatter *f) {}
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok cache status
get_type
返回 OS 类型,如 bluestore。
/**
* @param
* @return string 类型字符串
*/
virtual std::string get_type() = 0;
test_mount_in_use
mount 前的预测试,在 OSD::pre_init()中被调用,如若发生错误,则说明 ObjectStore::mount() 不可用。
/**
* @param
* @return true for success, false for failure
*/
virtual bool test_mount_in_use() = 0;
mount / umount
挂载/卸载 OS。在 mkfs() 之后使用。
/**
* @param
* @return 0 for success
*/
virtual int mount() = 0;
virtual int umount() = 0;
fsck
对 OS 进行检查或者修复。
/**
* @param deep true for FSCK_DEEP, false for FSCK_REGULAR
* @return 0 for success, other for failure
*/
virtual int fsck(bool deep)
set_cache_shards
开启缓存。BlueStore 支持自己管理 onode 和 buffer 缓存。此函数用于创建缓存实例,num 是创建的总实例数量。在《Ceph之rados设计原理与实现》p101 页介绍了每个 BlueStore 包含多个 Cache 实例,每个 OSD 相应地会设置多个 PG 工作队列,BlueStore 中的 Cache 实例个数与之对应。
此函数在 BlueStore create() 时被调用。
/**
* @param num cache 实例总数
* @return
*/
virtual void set_cache_shards(unsigned num)
validate_hobject_key
BlueStore 中支持任何长度的 name,所以此函数在 BlueStore 中始终返回 0。
/**
* @param obj hobject 引用
* @return 0 for valid, other for invalid
*/
virtual int validate_hobject_key(const hobject_t &obj) const = 0;
get_max_attr_name_length
BlueStore 内部对 xattr name 的长度也没有真正限制。这里返回 256。
/**
* @param
* @return unsigned xattr name 最大长度,默认256
*/
virtual unsigned get_max_attr_name_length() = 0;
mkfs
OS 格式化。在 create() 之后使用。内部提供检测机制,支持对一个 OSD 目录多次调用 mkfs()。
/**
* @param
* @return 0 for success, other for failure
*/
virtual int mkfs() = 0;
mkjournal | needs_journal | wants_journal | allows_journal
BlueStore 不支持。
virtual int mkjournal() = 0; // journal only
virtual bool needs_journal() = 0; //< requires a journal
virtual bool wants_journal() = 0; //< prefers a journal
virtual bool allows_journal() = 0; //< allows a journa
get_min_alloc_size
返回最小分配空间。默认 4KB。支持配置文件修改:bluestore_min_alloc_size,bluestore_min_alloc_size_hdd,bluestore_min_alloc_size_ssd。
/**
* @param
* @return uint64_t 最小分配空间的字节长度
*/
virtual uint64_t get_min_alloc_size() const
get_devices
枚举所有磁盘设备。
/**
* @param devls 记录所有磁盘设备的位置
* @return 0 for success, other for failure
*/
virtual int get_devices(std::set<std::string> *devls)
下述命令可以直接调用 get_devices 函数获取磁盘设备。
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok list_devices
{
"device": "/dev/sdb"
}
is_sync_onreadable
BlueStore 不支持。
/**
* @param
* @return
*/
virtual bool is_sync_onreadable() const
is_rotational
验证 SLOW 设备是 HDD 还是 SSD,true 为 HDD,false 为 SSD。
/**
* @param
* @return true for HDD, false for SSD
*/
virtual bool is_rotational()
is_journal_rotational
BlueStore 中检查 WAL 设备是 HDD 还是 SSD,true 为 HDD,false 为 SSD。
/**
* @param
* @return true for HDD, false for SSD
*/
virtual bool is_journal_rotational()
get_default_device_class
调用 is_rotational() 函数,查询设备的类别:HDD 或者 SSD
/**
* @param
* @return string hdd or ssd
*/
virtual std::string get_default_device_class()
get_numa_node
暂不可用
virtual int get_numa_node(
int *numa_node,
std::set<int> *nodes,
std::set<std::string> *failed)
can_sort_nibblewise
BlueStore 不支持
virtual bool can_sort_nibblewise()
statfs
查询 OS 的文件系统信息:
uint64_t total = 0; ///< Total bytes
uint64_t available = 0; ///< Free bytes available
uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
int64_t allocated = 0; ///< Bytes allocated by the store
int64_t data_stored = 0; ///< Bytes actually stored by the user
int64_t data_compressed = 0; ///< Bytes stored after compression
int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
int64_t data_compressed_original = 0; ///< Bytes that were compressed
int64_t omap_allocated = 0; ///< approx usage of omap data
int64_t internal_metadata = 0; ///< approx usage of internal metadata
virtual int statfs(struct store_statfs_t *buf,
osd_alert_list_t* alerts = nullptr)
可以使用 ceph-objectstore-tool 工具查看 statfs 信息
[root@localhost bin]# ./ceph-objectstore-tool --data-path /root/ceph/build/dev/osd0/ --op statfs --no-mon-config
{
"total": 108447916032,
"available": 107373961216,
"internally_reserved": 0,
"allocated": 212992,
"data_stored": 62544,
"data_compressed": 0,
"data_compressed_allocated": 0,
"data_compressed_original": 0,
"omap_allocated": 972,
"internal_metadata": 22019124
}
pool_statfs
获取池文件系统信息
uint64_t total = 0; ///< Total bytes
uint64_t available = 0; ///< Free bytes available
uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
int64_t allocated = 0; ///< Bytes allocated by the store
int64_t data_stored = 0; ///< Bytes actually stored by the user
int64_t data_compressed = 0; ///< Bytes stored after compression
int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
int64_t data_compressed_original = 0; ///< Bytes that were compressed
int64_t omap_allocated = 0; ///< approx usage of omap data
int64_t internal_metadata = 0; ///< approx usage of internal metadata
virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, bool *per_pool_omap) = 0;
collect_metadata
获取的信息有:
bluefs
bluefs_single_shared_device
bluefs_dedicated_db
bluefs_dedicated_wal
objectstore_numa_unknown_devices
objectstore_numa_nodes
virtual void collect_metadata(std::map<std::string,std::string> *pm)
write_meta
BlueStore 中向块设备的超级块写入元数据,以键值对形式。
注:BlueStore 的超级块是 SLOW 设备的第一个 4KB 块。BlueFS 的超级块是 DB 设备(当 DB 不存在时,使用 SLOW设备)的第二个 4KB 块。
/**
* write_meta - write a simple configuration key out-of-band
*
* Write a simple key/value pair for basic store configuration
* (e.g., a uuid or magic number) to an unopened/unmounted store.
* The default implementation writes this to a plaintext file in the
* path.
*
* A newline is appended.
*
* @param key key name (e.g., "fsid")
* @param value value (e.g., a uuid rendered as a std::string)
* @returns 0 for success, or an error code
*/
virtual int write_meta(const std::string& key,
const std::string& value);
可以使用多种工具查看,这里给出 ceph-objectstore-tool 工具的查看命令:
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op dump-super
{
"cluster_fsid": "60e065f1-d992-4d1a-8f4e-f74419674f7e",
"osd_fsid": "9912f587-6c2c-4098-8635-b97fd46f721e",
"whoami": 0,
"current_epoch": 156,
"oldest_map": 1,
"newest_map": 156,
"weight": 0,
"compat": {
"compat": {},
"ro_compat": {},
"incompat": {
"feature_1": "initial feature set(~v.18)",
"feature_2": "pginfo object",
"feature_3": "object locator",
"feature_4": "last_epoch_clean",
"feature_5": "categories",
"feature_6": "hobjectpool",
"feature_7": "biginfo",
"feature_8": "leveldbinfo",
"feature_9": "leveldblog",
"feature_10": "snapmapper",
"feature_11": "sharded objects",
"feature_12": "transaction hints",
"feature_13": "pg meta object",
"feature_14": "explicit missing set",
"feature_15": "fastinfo pg attr",
"feature_16": "deletes in missing set"
}
},
"clean_thru": 156,
"last_epoch_mounted": 154
}
read_meta
读取超级块信息
/**
* read_meta - read a simple configuration key out-of-band
*
* Read a simple key value to an unopened/mounted store.
*
* Trailing whitespace is stripped off.
*
* @param key key name
* @param value pointer to value std::string
* @returns 0 for success, or an error code
*/
virtual int read_meta(const std::string& key,
std::string *value);
open_collection
获取 collection。BlueStore 会查询 kvdb 中前缀为 C 的所有 kv 键值对,找到 key 值匹配的集合并返回,若未在 kvdb 命中,也会返回一个 collection 指针。
/**
* get a collection handle
*
* Provide a trivial handle as a default to avoid converting legacy
* implementations.
*
* @param cid 集合id、类型的包装
* @return CollectionHandle 集合句柄
*/
virtual CollectionHandle open_collection(const coll_t &cid) = 0;
create_new_collection
创建一个集合,实际就是把集合写入 kvdb,此操作需要通过 queue_transaction() 才能生效。
/**
* get a collection handle for a soon-to-be-created collection
*
* This handle must be used by queue_transaction that includes a
* create_collection call in order to become valid. It will become the
* reference to the created collection.
*
* @param cid 集合id
* @return CollectionHandle 集合句柄
*/
virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;
set_collection_commit_queue
为 collection 设置一个 on_commit 回调函数队列,每个 collection 只有一个该队列。在 /src/osd/OSD.cc 中被调用,只在新建集合、载入集合、分裂集合时才会创建。
/**
* set ContextQueue for a collection
*
* After that, oncommits of Transaction will queue into commit_queue.
* And osd ShardThread will call oncommits.
*
* @param cid 集合id
* @param commit_queue on_commit 回调函数队列实例
* @return
*/
virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;
exists
判断集合中是否存在该对象。
/**
* exists -- Test for existence of object
*
* @param cid collection for object
* @param oid oid of object
* @returns true if object exists, false otherwise
*/
virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;
set_collection_opts
设置存储池 pool 选项。在 src/osd/PG.cc 中被调用。
具体选项:
SCRUB_MIN_INTERVAL,
SCRUB_MAX_INTERVAL,
DEEP_SCRUB_INTERVAL,
RECOVERY_PRIORITY,
RECOVERY_OP_PRIORITY,
SCRUB_PRIORITY,
COMPRESSION_MODE,
COMPRESSION_ALGORITHM,
COMPRESSION_REQUIRED_RATIO,
COMPRESSION_MAX_BLOB_SIZE,
COMPRESSION_MIN_BLOB_SIZE,
CSUM_TYPE,
CSUM_MAX_BLOCK,
CSUM_MIN_BLOCK,
FINGERPRINT_ALGORITHM,
PG_NUM_MIN, // min pg_num
TARGET_SIZE_BYTES, // total bytes in pool
TARGET_SIZE_RATIO, // fraction of total cluster
PG_AUTOSCALE_BIAS,
READ_LEASE_INTERVAL,
DEDUP_TIER,
DEDUP_CHUNK_ALGORITHM,
DEDUP_CDC_CHUNK_SIZE,
/**
* set_collection_opts -- set pool options for a collection
*
* @param cid collection
* @param opts new collection options
* @returns 0 on success, negative error code on failure.
*/
virtual int set_collection_opts(
CollectionHandle& c,
const pool_opts_t& opts) = 0;
stat
获取对象文件属性信息。stat 信息并非全部填写,而是只获取部分。
具体有:
st->st_size = o->onode.size;
st->st_blksize = 4096;
st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
st->st_nlink = 1;
/**
* stat -- get information for an object
*
* @param cid collection for object
* @param oid oid of object
* @param st output information for the object
* @param allow_eio if false, assert on -EIO operation failure
* @returns 0 on success, negative error code on failure.
*/
virtual int stat(
CollectionHandle &c,
const ghobject_t& oid,
struct stat *st,
bool allow_eio = false) = 0;
read
读取对象数据,可以设置 offset 和 length,默认为读取整个对象。
/**
* read -- read a byte range of data from an object
*
* Note: if reading from an offset past the end of the object, we
* return 0 (not, say, -EINVAL).
*
* @param cid collection for object
* @param oid oid of object
* @param offset location offset of first byte to be read
* @param len number of bytes to be read
* @param bl output ceph::buffer::list
* @param op_flags is CEPH_OSD_OP_FLAG_*
* @returns number of bytes read on success, or negative error code on failure.
*/
virtual int read(
CollectionHandle &c,
const ghobject_t& oid,
uint64_t offset,
size_t len,
ceph::buffer::list& bl,
uint32_t op_flags = 0) = 0;
fiemap
分段加载 extent_map,为了支持 readv() 函数。
此函数作用是把 object 对应范围的 extent_map 读取到内存中。返回一个 set,保存了读取的范围。
extent_map 在 BlueStore 中是分片保存在磁盘上,因此需要读取对应分片。此函数可以把一定范围的对象转为 分片范围,读取到内存中。
注意:返回的只是读取范围 [start, end],并不是 extent_map 内容。
/**
* fiemap -- get extent map of data of an object
*
* Returns an encoded std::map of the extents of an object's data portion
* (std::map<offset,size>).
*
* A non-enlightened implementation is free to return the extent (offset, len)
* as the sole extent.
*
* @param cid collection for object
* @param oid oid of object
* @param offset location offset of first byte to be read
* @param len number of bytes to be read
* @param bl output ceph::buffer::list for extent std::map information.
* @returns 0 on success, negative error code on failure.
*/
virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
uint64_t offset, size_t len, ceph::buffer::list& bl) = 0;
virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) = 0;
readv
同 read() 类似。区别在于 read() 只能读取一段数据,readv() 支持读取多段数据。
/**
* readv -- read specific intervals from an object;
* caller must call fiemap to fill in the extent-map first.
*
* Note: if reading from an offset past the end of the object, we
* return 0 (not, say, -EINVAL). Also the default version of readv
* reads each extent separately synchronously, which can become horribly
* inefficient if the physical layout of the pushing object get massively
* fragmented and hence should be overridden by any real os that
* cares about the performance..
*
* @param cid collection for object
* @param oid oid of object
* @param m intervals to be read
* @param bl output ceph::buffer::list
* @param op_flags is CEPH_OSD_OP_FLAG_*
* @returns number of bytes read on success, or negative error code on failure.
*/
virtual int readv(
CollectionHandle &c,
const ghobject_t& oid,
interval_set<uint64_t>& m,
ceph::buffer::list& bl,
uint32_t op_flags = 0)
dump_onode
目前仅在 ceph-objectstore-tool 中被调用到。
/**
 * dump_onode -- dump an object's onode metadata in human-readable form;
 * intended primarily for debugging.
 *
 * This default implementation does no work and reports the operation as
 * unsupported. The method is virtual, so a backend can supply a real one.
 *
 * @param c collection handle for the object
 * @param oid oid of object
 * @param section_name section name to create and print under
 * @param f Formatter instance to print to
 * @returns 0 on success, negative error code on failure
 *          (always -ENOTSUP from this default implementation).
 */
virtual int dump_onode(
CollectionHandle &c,
const ghobject_t& oid,
const std::string& section_name,
ceph::Formatter *f) {
return -ENOTSUP;
}
给出 ceph-objectstore-tool 使用范例:
[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ rbd_header.20e5ff0224ec0 dump
{
"id": {
"oid": "rbd_header.20e5ff0224ec0",
"key": "",
"snapid": -2,
"hash": 1624672572,
"max": 0,
"pool": 2,
"namespace": "",
"max": 0
},
"info": {
"oid": {
"oid": "rbd_header.20e5ff0224ec0",
"key": "",
"snapid": -2,
"hash": 1624672572,
"max": 0,
"pool": 2,
"namespace": ""
},
"version": "137'29",
"prior_version": "137'28",
"last_reqid": "osd.1.0:2",
"user_version": 27,
"size": 0,
"mtime": "2021-05-27 09:33:24.367195",
"local_mtime": "2021-05-27 09:33:24.422271",
"lost": 0,
"flags": [
"dirty",
"omap",
"data_digest",
"omap_digest"
],
"truncate_seq": 0,
"truncate_size": 0,
"data_digest": "0xffffffff",
"omap_digest": "0x4bbef111",
"expected_object_size": 0,
"expected_write_size": 0,
"alloc_hint_flags": 0,
"manifest": {
"type": 0
},
"watchers": {}
},
"stat": {
"size": 0,
"blksize": 4096,
"blocks": 0,
"nlink": 1
},
"SnapSet": {
"snap_context": {
"seq": 0,
"snaps": []
},
"clones": []
}
}
getattr
查询对象的 xattr 属性。通过 kvdb 获取 onode 信息,xattr 保存在每个对象的 onode 中。
支持多种返回类型:ptr、buffer、map<string, ptr>、map<string, list>。
/**
* getattr -- get an xattr of an object
*
* @param cid collection for object
* @param oid oid of object
* @param name name of attr to read
* @param value place to put output result.
* @returns 0 on success, negative error code on failure.
*/
virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
const char *name, ceph::buffer::ptr& value) = 0;
/**
 * getattr -- fetch a single xattr of an object into a buffer list
 *
 * Convenience overload: forwards to the (const char*, buffer::ptr)
 * overload and then appends the fetched pointer to the caller's list.
 *
 * @param c collection handle for the object
 * @param oid oid of object
 * @param name name of attr to read
 * @param value buffer list that receives the attribute data
 * @returns the result of the underlying getattr() call, which is
 *          documented as 0 on success, negative error code on failure.
 */
int getattr(
  CollectionHandle &c, const ghobject_t& oid,
  const std::string& name, ceph::buffer::list& value) {
  ceph::buffer::ptr attr_ptr;
  const int rc = getattr(c, oid, name.c_str(), attr_ptr);
  // The pointer is appended unconditionally, independent of rc.
  value.push_back(attr_ptr);
  return rc;
}
/**
* getattrs -- get all of the xattrs of an object
*
* @param cid collection for object
* @param oid oid of object
* @param aset place to put output result.
* @returns 0 on success, negative error code on failure.
*/
virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
std::map<std::string,ceph::buffer::ptr>& aset) = 0;
/**
 * getattrs -- fetch all xattrs of an object as buffer lists
 *
 * Convenience overload: calls the buffer::ptr map overload, then appends
 * each fetched pointer to the entry of the same name in the caller's map.
 * Entries already present in the caller's map are appended to, not
 * replaced.
 *
 * @param c collection handle for the object
 * @param oid oid of object
 * @param aset map that receives the attribute data, keyed by name
 * @returns the result of the underlying getattrs() call, which is
 *          documented as 0 on success, negative error code on failure.
 */
int getattrs(CollectionHandle &c, const ghobject_t& oid,
             std::map<std::string,ceph::buffer::list>& aset) {
  std::map<std::string,ceph::buffer::ptr> fetched;
  const int rc = getattrs(c, oid, fetched);
  // Copy over whatever was fetched, regardless of rc.
  for (auto& attr : fetched) {
    aset[attr.first].append(attr.second);
  }
  return rc;
}
list_collections
查询此 OSD 的所有集合。
/**
* list_collections -- get all of the collections known to this ObjectStore
*
* @param ls std::list of the collections in sorted order.
* @returns 0 on success, negative error code on failure.
*/
virtual int list_collections(std::vector<coll_t>& ls) = 0;
collection_exists
检查 OSD 中是否有该集合。
/**
* does a collection exist?
*
* @param c collection
* @returns true if it exists, false otherwise
*/
virtual bool collection_exists(const coll_t& c) = 0;
collection_empty
检查集合是否为空(没有对象)?
/**
* is a collection empty?
*
* @param c collection
* @param empty true if the specified collection is empty, false otherwise
* @returns 0 on success, negative error code on failure.
*/
virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;
collection_bits
对象在进行 crush 运算映射到某个 pg 时,因为 pg 的数量总是有限的,因此不需要对整个对象 id 进行 hash 映射,只需要取最后的 n 位(2^n = pg 数量),即 n 位表示对象通过 stable_mod 映射至 pg 时,其32位全精度哈希值(从最低位开始)有多少位是有效的。这一概念在《Ceph之Rados设计原理与实现》一书中P11 ~ P13页有详细介绍。
/**
* return the number of significant bits of the coll_t::pgid.
*
* This should return what the last create_collection or split_collection
* set. A legacy backend may return -EAGAIN if the value is unavailable
* (because we upgraded from an older version, e.g., FileStore).
*/
virtual int collection_bits(CollectionHandle& c) = 0;
collection_list
列出集合中指定范围的对象。
/**
* list contents of a collection that fall in the range [start, end), returning no more than a specified number of results
*
* @param c collection
* @param start list object that sort >= this value
* @param end list objects that sort < this value
* @param max return no more than this many results
* @param seq return no objects with snap < seq
* @param ls [out] result
* @param next [out] next item sorts >= this value
* @return zero on success, or negative error
*/
virtual int collection_list(CollectionHandle &c,
const ghobject_t& start, const ghobject_t& end,
int max,
std::vector<ghobject_t> *ls, ghobject_t *next) = 0;
virtual int collection_list_legacy(CollectionHandle &c,
                                   const ghobject_t& start,
                                   const ghobject_t& end, int max,
                                   std::vector<ghobject_t> *ls,
                                   ghobject_t *next) {
  // Default implementation: forward to collection_list() with the same
  // arguments and return its result unchanged. Virtual, so a backend can
  // override it.
  const int rc = collection_list(c, start, end, max, ls, next);
  return rc;
}
omap_get
查询指定对象的 omap 属性。
omap 在kvdb 的保存形式为:prefix : key :value。每个对象的 omap 单独保存在 kvdb 中,前缀为 M。
virtual int omap_get(
CollectionHandle &c, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
ceph::buffer::list *header, ///< [out] omap header
std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value std::map
) = 0;
omap_get_header
获取指定对象的 omap_header(每个对象只有一个)。每个对象的 omap_header 单独保存在 kvdb 中,前缀为 M.
virtual int omap_get_header(
CollectionHandle &c, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
ceph::buffer::list *header, ///< [out] omap header
bool allow_eio = false ///< [in] don't assert on eio
) = 0;
omap_get_keys
计算对象 omap 在 kvdb 中的 key 值。
virtual int omap_get_keys(
CollectionHandle &c, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
std::set<std::string> *keys ///< [out] Keys defined on oid
) = 0;
omap_get_values
获取指定对象的某个 key 对应的 value。
virtual int omap_get_values(
CollectionHandle &c, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
const std::set<std::string> &keys, ///< [in] Keys to get
std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
) = 0;
omap_check_keys
检查对象的 omap 在 kvdb 的键值对中是否有指定 key(即检查该 key 是否存在于 kvdb 中)。支持批量检查。
/// Filters keys into out which are defined on oid
int omap_check_keys(
CollectionHandle &c, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
const std::set <std::string> &keys, ///< [in] Keys to check
std::set <std::string> *out ///< [out] Subset of keys defined on oid
)
get_omap_iterator
指定一个前缀,返回该前缀的 kvdb 迭代器。通过此迭代器,可以方便的访问 kvdb 内部数据。
/**
* Returns an object map iterator
*
* Warning! The returned iterator is an implicit lock on filestore
* operations in c. Do not use filestore methods on c while the returned
* iterator is live. (Filling in a transaction is no problem).
*
* @return iterator, null on error
*/
virtual ObjectMap::ObjectMapIterator get_omap_iterator(
CollectionHandle &c, ///< [in] collection
const ghobject_t &oid ///< [in] object
) = 0;
Ceph 中一般使用 RocksDB,并进行了二次封装。Ceph 封装后的 kvdb 迭代器提供了以下功能:
seek_to_first():位置调整到第一个key,如果该前缀对应的内容不为空,则 iterator.valid() 为 true,否则为false。
seek_to_last():位置调整到最后一个key。如果该前缀的内容为空,则 iterator.valid() 为 false,否则为 true。
upper_bound(const string &after):查找 key,位置调整到 after 的下一个位置。
lower_bound(const string &to):查找 key,位置调整到 to 的位置。
next():位置向后调整一位。
prev():位置向前调整一位。
valid():此位置是否有效。无效说明不存在,或者越过边界。
key():获取迭代器位置对应的 key。
raw_key():获取迭代器位置对应的 (prefix + key)。
value():获取迭代器位置的 value,返回一个 bufferlist。
value_as_ptr():获取迭代器位置的 value,返回一个 buffer_ptr。
status():迭代器位置是否有效,无效说明不存在,或者越过边界。
flush_journal
BlueStore 不支持。
virtual int flush_journal() { return -EOPNOTSUPP; }
dump_journal
BlueStore 不支持。
virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }
snapshot
BlueStore 不支持。
virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; }
set_fsid
设置 OSD 的 fsid。
注:此修改仅影响正在运行的 OSD 进程,不修改 /var/lib/ceph/osd/fsid 文件保存的内容。
/**
* Set and get internal fsid for this instance. No external data is modified
*/
virtual void set_fsid(uuid_d u) = 0;
get_fsid
获取 OSD 进程中 fsid。
estimate_objects_overhead
评估对象需要使用多少的额外空间(除了其自身的 data 数据)。
计算方式:额外空间 = 对象数量 * 300 byte
/**
* Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store
* - num objects - total (including whiteouts) object count to measure used space for.
*/
virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;
inject_data_error
virtual void inject_data_error(const ghobject_t &oid) {}
inject_mdata_error
virtual void inject_mdata_error(const ghobject_t &oid) {}
compact
压缩 kvdb 空间。RocksDB 使用 append(追加写)方式记录数据,会产生大量的重复数据,通过此函数可以压缩空间。
virtual void compact() {}
has_builtin_csum
BlueStore 支持数据校验,返回值始终为 true。
virtual bool has_builtin_csum() const
Transaction api
ObjectStore 中提供了 queue_transactions() 函数,此函数的作用是把 op 事务提交到存储引擎中,所有涉及数据改写的操作都需要使用 Transaction。因此,了解 Transaction 的功能才能真正体会 ObjectStore 对外提供了哪些能力。
register_on_applied_sync
注册 on_applied 回调函数。在事务应用后同步执行。具体时机在 queue_transaction() 完成末尾调用。
on_applied:事务应用完成后的回调函数,即事务已经在存储引擎中生效,但此时数据并不一定已经落盘。
/**
* @param c 回调函数
* @return
*/
void register_on_applied_sync(Context *c)
register_on_applied
注册 on_applied 异步执行的回调函数。在事务应用后异步执行,由 finisher 线程处理。
/**
* @param c 回调函数
* @return
*/
void register_on_applied(Context *c)
register_on_commit
注册 on_commit 异步执行的回调函数。在数据写入 WAL 盘后执行,由 finisher 线程处理。
on_commit:事务提交后的回调函数。数据虽然没有写入数据盘,但是已经写入日志盘。
/**
* @param c 回调函数
* @return
*/
void register_on_commit(Context *c)
register_on_complete
同时注册 on_applied 和 on_commit 两种回调函数。
/**
* @param c 回调函数
* @return
*/
void register_on_complete(Context *c)
has_contexts
检查是否有 on_commit、on_applied 或者 on_applied_sync 回调函数,若有任意一个存在,则返回 true,否则 false。
/**
* @param
* @return true for exist, other for none
*/
bool has_contexts() const
collect_contexts
解析出 transaction 中的三种回调函数。
/**
* @param t 事务,其中保存了回调函数
* @param out_on_applied 用于存放 on_applied
* @param out_on_commit 用于存放 on_commit
* @param out_on_applied_sync 用于存放 on_applied_sync
* @return
*/
static void collect_contexts(
std::vector<Transaction> &t,
Context **out_on_applied,
Context **out_on_commit,
Context **out_on_applied_sync)
static void collect_contexts(
std::vector<Transaction> &t,
std::list<Context *> *out_on_applied,
std::list<Context *> *out_on_commit,
std::list<Context *> *out_on_applied_sync)
注:以下操作都需要通过 queue_transaction() 提交
create
创建一个对象。此时对象 data 为空。
/**
* create
*
* create an object that does not yet exist
* (behavior is undefined if the object already exists)
*
* @param cid 集合id
* @param oid 对象id
* @return
*/
void create(const coll_t &cid, const ghobject_t &oid)
touch
创建一个对象,等同于 create()。此时对象 data 为空。
/**
* touch
*
* Ensure the existence of an object in a collection. Create an
* empty object if necessary
*
* @param cid 集合id
* @param oid 对象id
* @return
*/
void touch(const coll_t &cid, const ghobject_t &oid)
write
向一个对象写入数据。注:可以存在文件空洞。
/**
* Write data to an offset within an object. If the object is too
* small, it is expanded as needed. It is possible to specify an
* offset beyond the current end of an object and it will be
* expanded as needed. Simple implementations of ObjectStore will
* just zero the data between the old end of the object and the
* newly provided data. More sophisticated implementations of
* ObjectStore will omit the untouched data and store it as a
* "hole" in the file.
*
* Note that a 0-length write does not affect the size of the object.
*
* @param cid 集合id
* @param oid 对象id
* @param off 待操作的对象 data 偏移量
* @param len 待操作的对象 data 长度
* @param write_data 写入的 data 数据
* @param flags 标志
* @return
*/
void write(const coll_t &cid, const ghobject_t &oid, uint64_t off, uint64_t len,
const ceph::buffer::list &write_data, uint32_t flags = 0)
zero
归零。
/**
* zero out the indicated byte range within an object. Some
* ObjectStore instances may optimize this to release the
* underlying storage space.
*
* If the zero range extends beyond the end of the object, the object
* size is extended, just as if we were writing a buffer full of zeros.
* EXCEPT if the length is 0, in which case (just like a 0-length write)
* we do not adjust the object size.
*
* @param cid 集合id
* @param oid 对象id
* @param off 待操作的对象 data 偏移量
* @param len 待操作的对象 data 长度
* @return
*/
void zero(const coll_t &cid, const ghobject_t &oid, uint64_t off, uint64_t len)
truncate
截断对象,只保留 [0, off) 范围内的数据,超出部分被丢弃。
/** Discard all data in the object beyond the specified size.
*
* @param cid 集合id
* @param oid 对象id
* @param off 超过此长度的全部截掉(删除)。
* @return
*/
void truncate(const coll_t &cid, const ghobject_t &oid, uint64_t off)
remove
删除一个对象。
/**
* Remove an object. All four parts of the object are removed.
*
* @prarm cid 集合id
* @param oid 对象id
* @return
*/
void remove(const coll_t &cid, const ghobject_t &oid) {
Op *_op = _get_next_op();
_op->op = OP_REMOVE;
_op->cid = _get_coll_id(cid);
_op->oid = _get_object_id(oid);
data.ops = data.ops + 1;
}
setattr、setattrs
为指定对象设置一个 xattr 属性,xattr 属性以键值对的形式保存在对象中,具体位置在 bluestore_onode_t 结构体中。
/**
* @param cid 集合id
* @param oid 对象id
* @param name、s 即 xattr 中的 key
* @param val 即 xattr 中的 value
* @param attrset 即 xattr 中 <key, value> 键值对
* @return
*/
/// Set an xattr of an object
void setattr(const coll_t &cid, const ghobject_t &oid, const char *name, ceph::buffer::list &val)
/// Set an xattr of an object
void setattr(const coll_t &cid, const ghobject_t &oid, const std::string &s, ceph::buffer::list &val)
/// Set multiple xattrs of an object
void setattrs(const coll_t &cid, const ghobject_t &oid, const std::map<std::string, ceph::buffer::ptr> &attrset)
/// Set multiple xattrs of an object
void setattrs(const coll_t &cid, const ghobject_t &oid, const std::map<std::string, ceph::buffer::list> &attrset)
rmattr、rmattrs
删除 xattr 属性。rmattr() 是删除指定的单个 xattr 属性, rmattrs() 则是删除一个对象的所有 xattrs 属性。
/**
* @param cid 集合id
* @param oid 对象id
* @param name、s 即 xattr 中的 key
* @param val 即 xattr 中的 value
* @return
*/
/// remove an xattr from an object
void rmattr(const coll_t &cid, const ghobject_t &oid, const char *name)
/// remove an xattr from an object
void rmattr(const coll_t &cid, const ghobject_t &oid, const std::string &s)
/// remove all xattrs from an object
void rmattrs(const coll_t &cid, const ghobject_t &oid)