objectstore and transaction api文档

本文详细介绍了Ceph的ObjectStore API,包括创建、读写对象、事务处理、元数据管理等功能。ObjectStore作为BlueStore的父类,提供了一系列接口供OSD(对象存储守护进程)使用,如创建和销毁对象存储实例、检查和修复OSD、获取和设置性能计数器、管理对象和集合等。此外,还涉及到对象的xattr属性、omap管理和文件系统统计信息等操作。
摘要由CSDN通过智能技术生成

ObjectStore api

ObjectStore 是 BlueStore 的父类,通过研究 ObjectStore 的文档,可以清楚 BlueStore 对外提供了哪些功能。

大部分功能都是向 OSD 提供,在 /src/osd/OSD.cc 中被调用。

create

创建一个 ObjectStore 实例。只会在初始化时调用一次。

  /**
   * create - create an ObjectStore instance.
   *
   * This is invoked once at initialization time.
   *
   * @param cct CephContext instance
   * @param type storage engine type (BlueStore, FileStore)
   * @param data OSD data path, e.g. /var/lib/ceph/osd0
   * @param journal journal path; required for FileStore, not needed for BlueStore
   * @param flags which filestores should check if applicable; not needed for BlueStore
   * @return pointer to the ObjectStore
   */
static ObjectStore *create(CephContext *cct,
			     const std::string& type,
			     const std::string& data,
			     const std::string& journal,
			     osflagbits_t flags = 0);

使用例子:

auto cct = global_init(...);
common_init_finish(g_ceph_context);
ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);

probe_block_device_fsid

返回一个 OSD 的 fsid号。即 /var/lib/ceph/osd/fsid 文件中保存的 uuid 序列号

  /**
   * probe a block device to learn the uuid of the owning OSD
   *
   * @param cct cct
   * @param path OSD 路径
   * @param fsid [out] OSD fsid 号
   * @return 0 for success, other for failure
   */
  static int probe_block_device_fsid(
    CephContext *cct,
    const std::string& path,
    uuid_d *fsid);

get_cur_stats

获取 ObjectStore 状态。

  /**
   * Fetch Object Store statistics.
   *
   * 返回延时和响应时间。write latency and apply times
   *
   * 此调用不会获取锁。即调用的瞬间可能 OS 状态发生改变,但获取的结果已经过时
   *
   * @param 
   * @return objectstore_perf_stat_t 实例
   */
  virtual objectstore_perf_stat_t get_cur_stats() = 0;

get_perf_counters

获取 OS 中的 perf_counters 对象指针。

  /**
   * Fetch Object Store performance counters.
   *
   * This appears to be called with nothing locked.
   *
   * @param 
   * @return PerfCounters 指针
   */
  virtual const PerfCounters* get_perf_counters() const = 0;

queue_transaction / queue_transactions

提交事务到 OS。

/**
 * Queue a single transaction on the store.
 *
 * Convenience wrapper: the transaction is moved into a one-element
 * vector which is then handed to queue_transactions().
 *
 * @param ch collection handle used to locate the target collection
 * @param t transaction bundling a group of ops (they may touch different objects)
 * @param op tracked-op reference, forwarded unchanged
 * @param handle thread-pool handle, forwarded unchanged
 * @return the result of queue_transactions() (0 for success, other for failure)
 */
  int queue_transaction(CollectionHandle& ch,
			Transaction&& t,
			TrackedOpRef op = TrackedOpRef(),
			ThreadPool::TPHandle *handle = NULL) {
    std::vector<Transaction> batch;
    batch.emplace_back(std::move(t));
    return queue_transactions(ch, batch, op, handle);
  }

  virtual int queue_transactions(
    CollectionHandle& ch, std::vector<Transaction>& tls,
    TrackedOpRef op = TrackedOpRef(),
    ThreadPool::TPHandle *handle = NULL) = 0;

upgrade

仅在 FileStore 中使用。

get_db_statistics

打印 OS 的使用情况,可以通过 OSD 使用 dump_objectstore_kv_stats 命令调用。

/**
 * @param f 输出流
 * @return 
 */
virtual void get_db_statistics(ceph::Formatter *f) { }

本人在集群中使用以下命令查看,发现返回结果为空

[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok dump_objectstore_kv_stats

generate_db_histogram

kvdb 统计

/**
 * @param f 输出流
 * @return
 */
virtual void generate_db_histogram(ceph::Formatter *f) { }
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok calc_objectstore_db_histogram
{
    "num_onodes": 660,
    "num_shards": 48,
    "num_super": 8,
    "num_coll": 193,
    "num_omap": 15,
    "num_pgmeta_omap": 2783,
    "num_deferred": 2,
    "num_alloc": 58,
    "num_stat": 3,
    "num_shared_shards": 0,
    "num_others": 0,
    "max_key_size": 79,
    "max_value_size": 41118,
    "total_key_size": 128176,
    "total_value_size": 674498
}
{
    "[0,64)": 1440,
    "[64,128)": 356,
    "[128,192)": 1630,
    "[320,384)": 2,
    "[384,448)": 5,
    "[448,512)": 7,
    "[512,576)": 128,
    "[576,640)": 1,
    "[640,704)": 1,
    "[832,896)": 1,
    "[896,960)": 193,
    "[1088,1152)": 2,
    "[1152,1216)": 1,
    "[1408,1472)": 1,
    "[16448,16512)": 1,
    "[41088,41152)": 1
}
{
    "prefix": "B",
    "key_hist": {
        "[0,32)": 58,
        "max_len": 17,
        "value_hist": {
            "[0,64)": 58,
            "max_len": 16
        }
    },
    "prefix": "C",
    "key_hist": {
        "[0,32)": 193,
        "max_len": 11,
        "value_hist": {
            "[0,64)": 193,
            "max_len": 10
        }
    },
    "prefix": "L",
    "key_hist": {
        "[0,32)": 2,
        "max_len": 10,
        "value_hist": {
            "[16448,16512)": 1,
            "max_len": 16474,
            "[41088,41152)": 1,
            "max_len": 41118
        }
    },
    "prefix": "M",
    "key_hist": {
        "[0,32)": 14,
        "max_len": 27,
        "value_hist": {
            "[0,64)": 14,
            "max_len": 26
        },
        "[32,64)": 1,
        "max_len": 32,
        "value_hist": {
            "[0,64)": 1,
            "max_len": 17
        }
    },
    "prefix": "P",
    "key_hist": {
        "[0,32)": 1046,
        "max_len": 26,
        "value_hist": {
            "[0,64)": 769,
            "max_len": 33,
            "[128,192)": 85,
            "max_len": 186,
            "[896,960)": 192,
            "max_len": 948
        },
        "[32,64)": 1737,
        "max_len": 42,
        "value_hist": {
            "[0,64)": 193,
            "max_len": 12,
            "[128,192)": 1544,
            "max_len": 187
        }
    },
    "prefix": "S",
    "key_hist": {
        "[0,32)": 8,
        "max_len": 26,
        "value_hist": {
            "[0,64)": 8,
            "max_len": 20
        }
    },
    "prefix": "T",
    "key_hist": {
        "[0,32)": 3,
        "max_len": 10,
        "value_hist": {
            "[0,64)": 3,
            "max_len": 40
        }
    },
    "prefix": "o",
    "key_hist": {
        "[32,64)": 554,
        "max_len": 59,
        "value_hist": {
            "[0,64)": 193,
            "max_len": 30,
            "[64,128)": 356,
            "max_len": 111,
            "[320,384)": 1,
            "max_len": 357,
            "[384,448)": 3,
            "max_len": 445,
            "[576,640)": 1,
            "max_len": 611
        },
        "[64,96)": 106,
        "max_len": 74,
        "value_hist": {
            "[384,448)": 2,
            "max_len": 422,
            "[448,512)": 6,
            "max_len": 473,
            "[512,576)": 94,
            "max_len": 517,
            "[640,704)": 1,
            "max_len": 660,
            "[832,896)": 1,
            "max_len": 855,
            "[896,960)": 1,
            "max_len": 939,
            "[1408,1472)": 1,
            "max_len": 1427
        }
    },
    "prefix": "x",
    "key_hist": {
        "[64,96)": 48,
        "max_len": 79,
        "value_hist": {
            "[0,64)": 8,
            "max_len": 9,
            "[128,192)": 1,
            "max_len": 147,
            "[320,384)": 1,
            "max_len": 339,
            "[448,512)": 1,
            "max_len": 483,
            "[512,576)": 34,
            "max_len": 547,
            "[1088,1152)": 2,
            "max_len": 1098,
            "[1152,1216)": 1,
            "max_len": 1166
        }
    }
}

flush_cache

清空 onode 和 buffer 缓存,因为缓存从磁盘读取,所以也不需要刷新到磁盘,直接清空缓存即可完成动作。

/**
 * @param os 输出流
 * @return 0 for success, other for failure
 */
virtual int flush_cache(std::ostream *os = NULL) { return -1; }
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok flush_store_cache

dump_perf_counters

/**
 * @param f, os 输出流
 * @return
 */
virtual void dump_perf_counters(ceph::Formatter *f) {}
virtual void dump_cache_stats(std::ostream& os) {}

dump_cache_stats

/**
 * @param f 输出流
 * @return
 */
virtual void dump_cache_stats(ceph::Formatter *f) {}
[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok cache status

get_type

返回 OS 类型,如 bluestore。

/**
 * @param
 * @return string 类型字符串
 */
virtual std::string get_type() = 0;

test_mount_in_use

mount 前的预测试,在 OSD::pre_init()中被调用,如若发生错误,则说明 ObjectStore::mount() 不可用。

/**
 * @param
 * @return true for success, false for failure
 */
virtual bool test_mount_in_use() = 0;

mount / umount

挂载/卸载 OS。在 mkfs() 之后使用。

  /**
 * @param 
 * @return 0 for success
 */
  virtual int mount() = 0;
  virtual int umount() = 0;

fsck

对 OS 进行检查或者修复。

/**
 * @param deep true for FSCK_DEEP, false for FSCK_REGULAR
 * @return 0 for success, other for false
 */
virtual int fsck(bool deep)

set_cache_shards

开启缓存。BlueStore 支持自己管理 onode 和 buffer 缓存。此函数用于创建缓存实例,num 是创建的总实例数量。在《Ceph之rados设计原理与实现》p101 页介绍了每个 BlueStore 包含多个 Cache 实例,每个 OSD 相应地会设置多个 PG 工作队列,BlueStore 中的 Cache 实例个数与之对应。

此函数在 BlueStore create() 时被调用。

/**
 * @param num cache 实例总数
 * @return 
 */
virtual void set_cache_shards(unsigned num) 

validate_hobject_key

BlueStore 中支持任何长度的 name,所以此函数在 BlueStore 中始终返回 0。

/**
 * @param obj hobject 引用
 * @return 0 for valid, other for invalid
 */
virtual int validate_hobject_key(const hobject_t &obj) const = 0;

get_max_attr_name_length

BlueStore 内部对 xattr name 的长度也没有真正限制。这里返回 256。

/**
 * @param 
 * @return unsigned xattr name 最大长度,默认256
 */
virtual unsigned get_max_attr_name_length() = 0;

mkfs

OS 格式化。在 create() 之后使用。内部提供检测机制,支持对一个 OSD 目录多次调用 mkfs()。

/**
 * @param
 * @return 0 for success, other for failure
 */
virtual int mkfs() = 0;

mkjournal | needs_journal | wants_journal | allows_journal

BlueStore 不支持。

virtual int mkjournal() = 0; // journal only
virtual bool needs_journal() = 0;  //< requires a journal
virtual bool wants_journal() = 0;  //< prefers a journal
virtual bool allows_journal() = 0; //< allows a journal

get_min_alloc_size

返回最小分配空间。默认 4KB。支持配置文件修改:bluestore_min_alloc_size,bluestore_min_alloc_size_hdd,bluestore_min_alloc_size_ssd。

/**
 * @param 
 * @return uint64_t 最小分配空间的字节长度
 */
virtual uint64_t get_min_alloc_size() const 

get_devices

枚举所有磁盘设备。

 /**
 * @param devls 记录所有磁盘设备的位置
 * @return 0 for success, other for failure
 */
 virtual int get_devices(std::set<std::string> *devls) 

下述命令可以直接调用 get_devices 函数获取磁盘设备。

[root@node-1 ~]# ceph daemon /var/run/ceph/ceph-osd.0.asok list_devices
{
    "device": "/dev/sdb"
}

is_sync_onreadable

BlueStore 不支持。

/**
 * @param
 * @return
 */
virtual bool is_sync_onreadable() const

is_rotational

验证 SLOW 设备是 HDD 还是 SSD,true 为 HDD,false 为 SSD。

/**
 * @param
 * @return true for HDD, false for SSD
 */
virtual bool is_rotational() 

is_journal_rotational

BlueStore 中检查 WAL 设备是 HDD 还是 SSD,true 为 HDD,false 为 SSD。

/**
 * @param
 * @return true for HDD, false for SSD
 */
virtual bool is_journal_rotational()

get_default_device_class

调用 is_rotational() 函数,查询设备的类别:HDD 或者 SSD

/**
 * @param
 * @return string hdd or ssd
 */
virtual std::string get_default_device_class()

get_numa_node

暂不可用


virtual int get_numa_node(
  int *numa_node,
  std::set<int> *nodes,
  std::set<std::string> *failed)

can_sort_nibblewise

BlueStore 不支持

virtual bool can_sort_nibblewise()

statfs

查询 OS 的文件系统信息:

  uint64_t total = 0;                  ///< Total bytes
  uint64_t available = 0;              ///< Free bytes available
  uint64_t internally_reserved = 0;    ///< Bytes reserved for internal purposes

  int64_t allocated = 0;               ///< Bytes allocated by the store

  int64_t data_stored = 0;                ///< Bytes actually stored by the user
  int64_t data_compressed = 0;            ///< Bytes stored after compression
  int64_t data_compressed_allocated = 0;  ///< Bytes allocated for compressed data
  int64_t data_compressed_original = 0;   ///< Bytes that were compressed

  int64_t omap_allocated = 0;         ///< approx usage of omap data
  int64_t internal_metadata = 0;      ///< approx usage of internal metadata
virtual int statfs(struct store_statfs_t *buf,
		     osd_alert_list_t* alerts = nullptr)

可以使用 ceph-objectstore-tool 工具查看 statfs 信息

[root@localhost bin]# ./ceph-objectstore-tool --data-path /root/ceph/build/dev/osd0/ --op statfs --no-mon-config
{
    "total": 108447916032,
    "available": 107373961216,
    "internally_reserved": 0,
    "allocated": 212992,
    "data_stored": 62544,
    "data_compressed": 0,
    "data_compressed_allocated": 0,
    "data_compressed_original": 0,
    "omap_allocated": 972,
    "internal_metadata": 22019124
}

pool_statfs

获取池文件系统信息

  uint64_t total = 0;                  ///< Total bytes
  uint64_t available = 0;              ///< Free bytes available
  uint64_t internally_reserved = 0;    ///< Bytes reserved for internal purposes

  int64_t allocated = 0;               ///< Bytes allocated by the store

  int64_t data_stored = 0;                ///< Bytes actually stored by the user
  int64_t data_compressed = 0;            ///< Bytes stored after compression
  int64_t data_compressed_allocated = 0;  ///< Bytes allocated for compressed data
  int64_t data_compressed_original = 0;   ///< Bytes that were compressed

  int64_t omap_allocated = 0;         ///< approx usage of omap data
  int64_t internal_metadata = 0;      ///< approx usage of internal metadata
virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, bool *per_pool_omap) = 0;

collect_metadata

获取的信息有:

bluefs 
bluefs_single_shared_device
bluefs_dedicated_db
bluefs_dedicated_wal
objectstore_numa_unknown_devices
objectstore_numa_nodes
virtual void collect_metadata(std::map<std::string,std::string> *pm)

write_meta

BlueStore 中向块设备的超级块写入元数据,以键值对形式。

注:BlueStore 的超级块是 SLOW 设备的第一个 4KB 块。BlueFS 的超级块是 DB 设备(当 DB 不存在时,使用 SLOW设备)的第二个 4KB 块。

  /**
   * write_meta - write a simple configuration key out-of-band
   *
   * Write a simple key/value pair for basic store configuration
   * (e.g., a uuid or magic number) to an unopened/unmounted store.
   * The default implementation writes this to a plaintext file in the
   * path.
   *
   * A newline is appended.
   *
   * @param key key name (e.g., "fsid")
   * @param value value (e.g., a uuid rendered as a std::string)
   * @returns 0 for success, or an error code
   */
  virtual int write_meta(const std::string& key,
			 const std::string& value);

可以使用多种工具查看,这里给出 ceph-objectstore-tool 工具的查看命令:

[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ --op dump-super
{
    "cluster_fsid": "60e065f1-d992-4d1a-8f4e-f74419674f7e",
    "osd_fsid": "9912f587-6c2c-4098-8635-b97fd46f721e",
    "whoami": 0,
    "current_epoch": 156,
    "oldest_map": 1,
    "newest_map": 156,
    "weight": 0,
    "compat": {
        "compat": {},
        "ro_compat": {},
        "incompat": {
            "feature_1": "initial feature set(~v.18)",
            "feature_2": "pginfo object",
            "feature_3": "object locator",
            "feature_4": "last_epoch_clean",
            "feature_5": "categories",
            "feature_6": "hobjectpool",
            "feature_7": "biginfo",
            "feature_8": "leveldbinfo",
            "feature_9": "leveldblog",
            "feature_10": "snapmapper",
            "feature_11": "sharded objects",
            "feature_12": "transaction hints",
            "feature_13": "pg meta object",
            "feature_14": "explicit missing set",
            "feature_15": "fastinfo pg attr",
            "feature_16": "deletes in missing set"
        }
    },
    "clean_thru": 156,
    "last_epoch_mounted": 154
}

read_meta

读取超级块信息

  /**
   * read_meta - read a simple configuration key out-of-band
   *
   * Read a simple key value to an unopened/mounted store.
   *
   * Trailing whitespace is stripped off.
   *
   * @param key key name
   * @param value pointer to value std::string
   * @returns 0 for success, or an error code
   */
  virtual int read_meta(const std::string& key,
			std::string *value);

open_collection

获取 collection。BlueStore 会查询 kvdb 中前缀为 C 的所有 kv 键值对,找到 key 值匹配的集合并返回,若未在 kvdb 命中,也会返回一个 collection 指针。

  /**
   * get a collection handle
   *
   * Provide a trivial handle as a default to avoid converting legacy
   * implementations.
   *
   * @param cid 集合id、类型的包装
   * @return CollectionHandle 集合句柄
   */
  virtual CollectionHandle open_collection(const coll_t &cid) = 0;

create_new_collection

创建一个集合,实际就是把集合写入 kvdb,此操作需要通过 queue_transaction() 才能生效。

  /**
   * get a collection handle for a soon-to-be-created collection
   *
   * This handle must be used by queue_transaction that includes a
   * create_collection call in order to become valid.  It will become the
   * reference to the created collection.
   * 
   * @param cid 集合id
   * @return ColletionHandle 集合句柄
   */
  virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;

set_collection_commit_queue

为 collection 设置一个 on_commit 回调函数队列,每个 collection 只有一个该队列。在 /src/osd/OSD.cc 中被调用,只在新建集合、载入集合、分裂集合时才会创建。

/**
   * Set the ContextQueue for a collection
   *
   * After that, oncommits of Transaction will queue into commit_queue.
   * And osd ShardThread will call oncommits.
   * 
   * @param cid collection id
   * @param commit_queue queue instance that receives the on_commit callbacks
   */
  virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;

exists

判断集合中是否存在该对象。

  /**
   * exists -- Test for existence of object
   *
   * @param c collection for object
   * @param oid oid of object
   * @returns true if object exists, false otherwise
   */
  virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;

set_collection_opts

设置存储池 pool 选项。在 src/osd/PG.cc 中被调用。

具体选项:

    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN,         // min pg_num
    TARGET_SIZE_BYTES,  // total bytes in pool
    TARGET_SIZE_RATIO,  // fraction of total cluster
    PG_AUTOSCALE_BIAS,
    READ_LEASE_INTERVAL,
    DEDUP_TIER,
    DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE,
  /**
   * set_collection_opts -- set pool options for a collection
   *
   * @param c collection
   * @param opts new collection options
   * @returns 0 on success, negative error code on failure.
   */
  virtual int set_collection_opts(
    CollectionHandle& c,
    const pool_opts_t& opts) = 0;

stat

获取对象文件属性信息。stat 信息并非全部填写,而是只获取部分。

具体有:

st->st_size = o->onode.size;
st->st_blksize = 4096;
st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
st->st_nlink = 1;
  /**
   * stat -- get information for an object
   *
   * @param cid collection for object
   * @param oid oid of object
   * @param st output information for the object
   * @param allow_eio if false, assert on -EIO operation failure
   * @returns 0 on success, negative error code on failure.
   */
  virtual int stat(
    CollectionHandle &c,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) = 0;

read

读取对象数据,可以设置 offset 和 length,默认为读取整个对象。

  /**
   * read -- read a byte range of data from an object
   *
   * Note: if reading from an offset past the end of the object, we
   * return 0 (not, say, -EINVAL).
   *
   * @param cid collection for object
   * @param oid oid of object
   * @param offset location offset of first byte to be read
   * @param len number of bytes to be read
   * @param bl output ceph::buffer::list
   * @param op_flags is CEPH_OSD_OP_FLAG_*
   * @returns number of bytes read on success, or negative error code on failure.
   */
   virtual int read(
     CollectionHandle &c,
     const ghobject_t& oid,
     uint64_t offset,
     size_t len,
     ceph::buffer::list& bl,
     uint32_t op_flags = 0) = 0;

fiemap

分段加载 extent_map,为了支持 readv() 函数。

此函数作用是把 object 对应范围的 extent_map 读取到内存中。返回一个 set,保存了读取的范围。
extent_map 在 BlueStore 中是分片保存在磁盘上,因此需要读取对应分片。此函数可以把一定范围的对象转为 分片范围,读取到内存中。

注意:返回的只是读取范围 [start, end],并不是 extent_map 内容。

  /**
   * fiemap -- get extent map of data of an object
   *
   * Returns the extents of an object's data portion as an offset -> size
   * map. The first overload delivers it encoded in a ceph::buffer::list,
   * the second fills a std::map<uint64_t, uint64_t> directly.
   *
   * A non-enlightened implementation is free to return the extent (offset, len)
   * as the sole extent.
   *
   * @param c collection for object
   * @param oid oid of object
   * @param offset location offset of first byte to be read
   * @param len number of bytes to be read
   * @param bl output ceph::buffer::list for extent map information.
   * @param destmap output map for extent map information (second overload).
   * @returns 0 on success, negative error code on failure.
   */
   virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
		      uint64_t offset, size_t len, ceph::buffer::list& bl) = 0;
   virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
		      uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) = 0;

readv

同 read() 类似。区别在于 read() 只能读取一段数据,readv() 支持读取多段数据。

  /**
   * readv -- read specific intervals from an object;
   * caller must call fiemap to fill in the extent-map first.
   *
   * Note: if reading from an offset past the end of the object, we
   * return 0 (not, say, -EINVAL). Also the default version of readv
   * reads each extent separately synchronously, which can become horribly
   * inefficient if the physical layout of the pushing object get massively
   * fragmented and hence should be overridden by any real os that
   * cares about the performance.
   *
   * @param c collection for object
   * @param oid oid of object
   * @param m intervals to be read
   * @param bl output ceph::buffer::list
   * @param op_flags is CEPH_OSD_OP_FLAG_*
   * @returns number of bytes read on success, or negative error code on failure.
   */
   virtual int readv(
     CollectionHandle &c,
     const ghobject_t& oid,
     interval_set<uint64_t>& m,
     ceph::buffer::list& bl,
     uint32_t op_flags = 0) 

dump_onode

目前仅在 ceph-objectstore-tool 中被调用到。

  /**
   * dump_onode -- dumps onode metadata in human readable form,
   * intended primarily for debugging
   *
   * The default implementation below does nothing and reports the
   * operation as unsupported.
   *
   * @param c collection for object
   * @param oid oid of object
   * @param section_name section name to create and print under
   * @param f Formatter class instance to print to
   * @returns 0 on success, negative error code on failure
   *          (-ENOTSUP from this default implementation).
   */
  virtual int dump_onode(
    CollectionHandle &c,
    const ghobject_t& oid,
    const std::string& section_name,
    ceph::Formatter *f) {
    return -ENOTSUP;
  }

给出 ceph-objectstore-tool 使用范例:

[root@node-1 ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0/ rbd_header.20e5ff0224ec0 dump
{
    "id": {
        "oid": "rbd_header.20e5ff0224ec0",
        "key": "",
        "snapid": -2,
        "hash": 1624672572,
        "max": 0,
        "pool": 2,
        "namespace": "",
        "max": 0
    },
    "info": {
        "oid": {
            "oid": "rbd_header.20e5ff0224ec0",
            "key": "",
            "snapid": -2,
            "hash": 1624672572,
            "max": 0,
            "pool": 2,
            "namespace": ""
        },
        "version": "137'29",
        "prior_version": "137'28",
        "last_reqid": "osd.1.0:2",
        "user_version": 27,
        "size": 0,
        "mtime": "2021-05-27 09:33:24.367195",
        "local_mtime": "2021-05-27 09:33:24.422271",
        "lost": 0,
        "flags": [
            "dirty",
            "omap",
            "data_digest",
            "omap_digest"
        ],
        "truncate_seq": 0,
        "truncate_size": 0,
        "data_digest": "0xffffffff",
        "omap_digest": "0x4bbef111",
        "expected_object_size": 0,
        "expected_write_size": 0,
        "alloc_hint_flags": 0,
        "manifest": {
            "type": 0
        },
        "watchers": {}
    },
    "stat": {
        "size": 0,
        "blksize": 4096,
        "blocks": 0,
        "nlink": 1
    },
    "SnapSet": {
        "snap_context": {
            "seq": 0,
            "snaps": []
        },
        "clones": []
    }
}

getattr

查询对象的 xattr 属性。通过 kvdb 获取 onode 信息,xattr 保存在每个对象的 onode 中。

支持多种返回类型:ptr、buffer、map<string, ptr>、map<string, list>。

  /**
   * getattr -- get an xattr of an object
   *
   * @param cid collection for object
   * @param oid oid of object
   * @param name name of attr to read
   * @param value place to put output result.
   * @returns 0 on success, negative error code on failure.
   */
  virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
		      const char *name, ceph::buffer::ptr& value) = 0;
  /**
   * getattr -- fetch one xattr of an object into a buffer list
   *
   * Delegates to the ceph::buffer::ptr overload; the ptr it fills is
   * pushed onto @p value whatever the return code is.
   *
   * @param c collection for object
   * @param oid oid of object
   * @param name name of attr to read
   * @param value place to put output result.
   * @returns the return code of the underlying getattr() call
   *          (0 on success, negative error code on failure).
   */
  int getattr(
    CollectionHandle &c, const ghobject_t& oid,
    const std::string& name, ceph::buffer::list& value) {
    ceph::buffer::ptr attr;
    const int rc = getattr(c, oid, name.c_str(), attr);
    value.push_back(attr);
    return rc;
  }

  /**
   * getattrs -- get all of the xattrs of an object
   *
   * @param cid collection for object
   * @param oid oid of object
   * @param aset place to put output result.
   * @returns 0 on success, negative error code on failure.
   */
  virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
		       std::map<std::string,ceph::buffer::ptr>& aset) = 0;

  /**
   * getattrs -- fetch every xattr of an object as buffer lists
   *
   * Delegates to the ceph::buffer::ptr overload, then appends each
   * returned ptr to the entry of the same name in @p aset. Entries are
   * appended whatever the return code is.
   *
   * @param c collection for object
   * @param oid oid of object
   * @param aset place to put output result.
   * @returns the return code of the underlying getattrs() call
   *          (0 on success, negative error code on failure).
   */
  int getattrs(CollectionHandle &c, const ghobject_t& oid,
	       std::map<std::string,ceph::buffer::list>& aset) {
    std::map<std::string,ceph::buffer::ptr> raw_attrs;
    const int rc = getattrs(c, oid, raw_attrs);
    for (auto& [attr_name, attr_ptr] : raw_attrs) {
      aset[attr_name].append(attr_ptr);
    }
    return rc;
  }

list_collections

查询此 OSD 的所有集合。

  /**
   * list_collections -- get all of the collections known to this ObjectStore
   *
   * @param ls std::list of the collections in sorted order.
   * @returns 0 on success, negative error code on failure.
   */
  virtual int list_collections(std::vector<coll_t>& ls) = 0;

collection_exists

检查 OSD 中是否有该集合。

  /**
   * does a collection exist?
   *
   * @param c collection
   * @returns true if it exists, false otherwise
   */
  virtual bool collection_exists(const coll_t& c) = 0;

collection_empty

检查集合是否为空(没有对象)?

  /**
   * is a collection empty?
   *
   * @param c collection
   * @param empty true if the specified collection is empty, false otherwise
   * @returns 0 on success, negative error code on failure.
   */
  virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;

collection_bits

对象在进行 crush 运算映射到某个 pg 时,因为 pg 的数量总是有限的,因此不需要对整个对象 id 进行 hash 映射,只需要取最后的 n 位(2^n = pg 数量),即 n 位表示对象通过 stable_mod 映射至 pg 时,其32位全精度哈希值(从最低位开始)有多少位是有效的。这一概念在《Ceph之Rados设计原理与实现》一书中P11 ~ P13页有详细介绍。

  /**
   * return the number of significant bits of the coll_t::pgid.
   *
   * This should return what the last create_collection or split_collection
   * set.  A legacy backend may return -EAGAIN if the value is unavailable
   * (because we upgraded from an older version, e.g., FileStore).
   *
   * @param c collection
   */
  virtual int collection_bits(CollectionHandle& c) = 0;

collection_list

列出集合中指定范围的对象。

  /**
   * list contents of a collection that fall in the range [start, end),
   * returning no more than a specified number of results
   *
   * @param c collection
   * @param start list objects that sort >= this value
   * @param end list objects that sort < this value
   * @param max return no more than this many results
   * @param ls [out] result
   * @param next [out] next item sorts >= this value
   * @return zero on success, or negative error
   */
  virtual int collection_list(CollectionHandle &c,
			      const ghobject_t& start, const ghobject_t& end,
			      int max,
			      std::vector<ghobject_t> *ls, ghobject_t *next) = 0;
  /**
   * Legacy variant of collection_list() with the same parameters.
   *
   * This default implementation forwards every argument unchanged to
   * collection_list() and returns its result.
   */
  virtual int collection_list_legacy(CollectionHandle &c,
                                     const ghobject_t& start,
                                     const ghobject_t& end, int max,
                                     std::vector<ghobject_t> *ls,
                                     ghobject_t *next) {
    return collection_list(c, start, end, max, ls, next);
  }

omap_get

查询指定对象的 omap 属性。

omap 在kvdb 的保存形式为:prefix : key :value。每个对象的 omap 单独保存在 kvdb 中,前缀为 M。

  virtual int omap_get(
    CollectionHandle &c,     ///< [in] Collection containing oid
    const ghobject_t &oid,   ///< [in] Object containing omap
    ceph::buffer::list *header,      ///< [out] omap header
    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value std::map
    ) = 0;

omap_get_header

获取指定对象的 omap_header(每个对象只有一个)。每个对象的 omap_header 单独保存在 kvdb 中,前缀为 M.

  virtual int omap_get_header(
    CollectionHandle &c,     ///< [in] Collection containing oid
    const ghobject_t &oid,   ///< [in] Object containing omap
    ceph::buffer::list *header,      ///< [out] omap header
    bool allow_eio = false ///< [in] don't assert on eio
    ) = 0;

omap_get_keys

获取指定对象 omap 中已定义的全部 key。

  virtual int omap_get_keys(
    CollectionHandle &c,   ///< [in] Collection containing oid
    const ghobject_t &oid, ///< [in] Object containing omap
    std::set<std::string> *keys      ///< [out] Keys defined on oid
    ) = 0;

omap_get_values

获取指定对象中一组 key 对应的 value,支持批量查询。

  virtual int omap_get_values(
    CollectionHandle &c,         ///< [in] Collection containing oid
    const ghobject_t &oid,       ///< [in] Object containing omap
    const std::set<std::string> &keys,     ///< [in] Keys to get
    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
    ) = 0;

omap_check_keys

检查对象的 omap 在 kvdb 的键值对中是否有指定 key(即检查该 key 是否存在于 kvdb 中)。支持批量检查。

  /// Filters keys into out which are defined on oid
  int omap_check_keys(
      CollectionHandle &c,                ///< [in] Collection containing oid
      const ghobject_t &oid,   ///< [in] Object containing omap
      const std::set <std::string> &keys, ///< [in] Keys to check
      std::set <std::string> *out         ///< [out] Subset of keys defined on oid
  )

get_omap_iterator

指定一个前缀,返回该前缀的 kvdb 迭代器。通过此迭代器,可以方便的访问 kvdb 内部数据。

  /**
   * Returns an object map iterator
   *
   * Warning!  The returned iterator is an implicit lock on filestore
   * operations in c.  Do not use filestore methods on c while the returned
   * iterator is live.  (Filling in a transaction is no problem).
   *
   * @return iterator, null on error
   */
  virtual ObjectMap::ObjectMapIterator get_omap_iterator(
    CollectionHandle &c,   ///< [in] collection
    const ghobject_t &oid  ///< [in] object
    ) = 0;

Ceph 中一般使用 RocksDB,并进行了二次封装。Ceph 封装后的 kvdb 迭代器提供了以下功能:

seek_to_first():位置调整到第一个key,如果该前缀对应的内容不为空,则 iterator.valid() 为 true,否则为false。
seek_to_last():位置调整到最后一个key。如果该前缀的内容为空,则 iterator.valid() 为 false,否则为 true。
upper_bound(const string &after):查找 key,位置调整到 after 的下一个位置。
lower_bound(const string &to):查找 key,位置调整到 to 的位置。
next():位置向后调整一位。
prev():位置向前调整一位。
valid():此位置是否有效。无效说明不存在,或者越过边界。
key():获取迭代器位置对应的 key。
raw_key():获取迭代器位置对应的 (prefix + key)。
value():获取迭代器位置的 value,返回一个 bufferlist。
value_as_ptr():获取迭代器位置的 value,返回一个 buffer_ptr。
status():迭代器位置是否有效,无效说明不存在,或者越过边界。

flush_journal

BlueStore 不支持。

virtual int flush_journal() { return -EOPNOTSUPP; }

dump_journal

BlueStore 不支持。

virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }

snapshot

BlueStore 不支持。

virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; }

set_fsid

设置 OSD 的 fsid。

注:此修改仅影响正在运行的 OSD 进程,不修改 /var/lib/ceph/osd/fsid 文件保存的内容。

  /**
   * Set and get internal fsid for this instance. No external data is modified
   */
  virtual void set_fsid(uuid_d u) = 0;

get_fsid

获取 OSD 进程中 fsid。

estimate_objects_overhead

评估对象需要使用多少的额外空间(除了其自身的 data 数据)。

计算方式:额外空间 = 对象数量 * 300 byte

  /**
  * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store
  *
  * @param num_objects total (including whiteouts) object count to measure used space for.
  */
  virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;

inject_data_error

virtual void inject_data_error(const ghobject_t &oid) {}

inject_mdata_error

virtual void inject_mdata_error(const ghobject_t &oid) {}

compact

压缩 kvdb 空间。RocksDB 使用 append 追加写方式记录数据,会产生大量的重复数据,通过此函数可以压缩空间。

virtual void compact() {}

has_builtin_csum

BlueStore 支持数据校验,返回值始终为 true。

virtual bool has_builtin_csum() const 

Transaction api

ObjectStore 中提供了 queue_transactions() 函数,此函数的作用是把 op 事务提交到存储引擎中,所有涉及数据改写操作都需要使用 Transaction。因此,了解 Transaction 的功能才能真正体会 ObjectStore 对外提供了哪些能力。

register_on_applied_sync

注册 on_applied 回调函数。在事务应用后同步执行。具体时机在 queue_transaction() 完成末尾调用。
on_applied:事务应用完成后的回调函数,即事务已经在存储引擎中生效,但此时数据并不一定已经落盘。

/**
 * @param c 回调函数
 * @return 
 */
void register_on_applied_sync(Context *c) 

register_on_applied

注册 on_applied 异步执行的回调函数。在事务应用后异步执行,由 finisher 线程处理。

/**
 * @param c 回调函数
 * @return 
 */
void register_on_applied(Context *c) 

register_on_commit

注册 on_commit 异步执行的回调函数。在数据写入 WAL 盘后执行,由 finisher 线程处理。
on_commit:事务提交后的回调函数。数据虽然没有写入数据盘,但是已经写入日志盘。

/**
 * @param c 回调函数
 * @return 
 */
void register_on_commit(Context *c)

register_on_complete

同时注册 on_applied 和 on_commit 两种回调函数。

/**
 * @param c 回调函数
 * @return 
 */
void register_on_complete(Context *c)

has_contexts

检查是否有 on_commit、on_applied 或者 on_applied_sync 回调函数,若有任意一个存在,则返回 true,否则 false。

/**
 * @param 
 * @return true for exist, other for none
 */
bool has_contexts() const

collect_contexts

解析出 transaction 中的三种回调函数。

/**
 * @param t 事务,其中保存了回调函数
 * @param out_on_applied 用于存放 on_applied
 * @param out_on_commit 用于存放 on_commit
 * @param out_on_applied_sync 用于存放 on_applied_sync
 * @return 
 */
static void collect_contexts(
        std::vector<Transaction> &t,
        Context **out_on_applied,
        Context **out_on_commit,
        Context **out_on_applied_sync)
    
static void collect_contexts(
        std::vector<Transaction> &t,
        std::list<Context *> *out_on_applied,
        std::list<Context *> *out_on_commit,
        std::list<Context *> *out_on_applied_sync) 

注:以下操作都需要通过 queue_transaction() 提交

create

创建一个对象。此时对象 data 为空。

    /**
     * create
     *
     * create an object that does not yet exist
     * (behavior is undefined if the object already exists)
     *
     * @param cid 集合id
     * @param oid 对象id
     * @return
     */
    void create(const coll_t &cid, const ghobject_t &oid) 

touch

确保对象存在:若对象不存在,则创建一个 data 为空的对象。与 create() 不同,create() 在对象已存在时行为未定义,而 touch() 可以安全地作用于已存在的对象。

    /**
     * touch
     *
     * Ensure the existence of an object in a collection. Create an
     * empty object if necessary
     *
     * @param cid 集合id 
     * @param oid 对象id
     * @return
     */
    void touch(const coll_t &cid, const ghobject_t &oid)

write

向一个对象写入数据。注:可以存在文件空洞。

    /**
     * Write data to an offset within an object. If the object is too
     * small, it is expanded as needed.  It is possible to specify an
     * offset beyond the current end of an object and it will be
     * expanded as needed. Simple implementations of ObjectStore will
     * just zero the data between the old end of the object and the
     * newly provided data. More sophisticated implementations of
     * ObjectStore will omit the untouched data and store it as a
     * "hole" in the file.
     *
     * Note that a 0-length write does not affect the size of the object.
     *
     * @param cid 集合id
     * @param oid 对象id
     * @param off 待操作的对象 data 偏移量
     * @param len 待操作的对象 data 长度
     * @param write_data 写入的 data 数据
     * @param flags 标志
     * @return
     */
    void write(const coll_t &cid, const ghobject_t &oid, uint64_t off, uint64_t len,
               const ceph::buffer::list &write_data, uint32_t flags = 0)

zero

将对象 [off, off + len) 范围内的数据置零。

    /**
     * zero out the indicated byte range within an object. Some
     * ObjectStore instances may optimize this to release the
     * underlying storage space.
     *
     * If the zero range extends beyond the end of the object, the object
     * size is extended, just as if we were writing a buffer full of zeros.
     * EXCEPT if the length is 0, in which case (just like a 0-length write)
     * we do not adjust the object size.
     *
     * @param cid 集合id
     * @param oid 对象id
     * @param off 待操作的对象 data 偏移量
     * @param len 待操作的对象 data 长度
     * @return 
     */
    void zero(const coll_t &cid, const ghobject_t &oid, uint64_t off, uint64_t len)

truncate

截断对象,只保留 [0, off) 范围的数据,超出部分全部丢弃。

    /** Discard all data in the object beyond the specified size.
     *
     * @param cid 集合id
     * @param oid 对象id
     * @param off 超过此长度的全部截掉(删除)。
     * @return 
     */
    void truncate(const coll_t &cid, const ghobject_t &oid, uint64_t off)

remove

删除一个对象。

    /**
     * Remove an object. All four parts of the object are removed.
     *
     * @prarm cid 集合id
     * @param oid 对象id
     * @return 
     */ 
    void remove(const coll_t &cid, const ghobject_t &oid) {
      Op *_op = _get_next_op();
      _op->op = OP_REMOVE;
      _op->cid = _get_coll_id(cid);
      _op->oid = _get_object_id(oid);
      data.ops = data.ops + 1;
    }

setattr、setattrs

为指定对象设置一个(或多个)xattr 属性,xattr 属性以键值对的形式保存在对象中,具体位置在 bluestore_onode_t 结构体中。

    /**
     * @param cid 集合id
     * @param oid 对象id
     * @param name、s 即 xattr 中的 key
     * @param val 即 xattr 中的 value
     * @param attrset 即 xattr 中 <key, value> 键值对
     * @return 
     */ 
    /// Set an xattr of an object
    void setattr(const coll_t &cid, const ghobject_t &oid, const char *name, ceph::buffer::list &val) 
    
    /// Set an xattr of an object
    void setattr(const coll_t &cid, const ghobject_t &oid, const std::string &s, ceph::buffer::list &val) 
        
    /// Set multiple xattrs of an object
    void setattrs(const coll_t &cid, const ghobject_t &oid, const std::map<std::string, ceph::buffer::ptr> &attrset) 

    /// Set multiple xattrs of an object
    void setattrs(const coll_t &cid, const ghobject_t &oid, const std::map<std::string, ceph::buffer::list> &attrset) 

rmattr、rmattrs

删除 xattr 属性。rmattr() 是删除指定的单个 xattr 属性, rmattrs() 则是删除一个对象的所有 xattrs 属性。

    /**
     * @param cid 集合id
     * @param oid 对象id
     * @param name、s 即 xattr 中的 key
     * @param val 即 xattr 中的 value
     * @return 
     */ 
	/// remove an xattr from an object
    void rmattr(const coll_t &cid, const ghobject_t &oid, const char *name) 

    /// remove an xattr from an object
    void rmattr(const coll_t &cid, const ghobject_t &oid, const std::string &s) 

    /// remove all xattrs from an object
    void rmattrs(const coll_t &cid, const ghobject_t &oid) 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值