OSD启动过程中osdmap加载流程
OSD启动入口是ceph_osd.cc的main函数,它会调用osd->init()完成OSD启动前的初始化工作。
int OSD::init()
{ ......//检查osd目录相关持久化数据,以及文件系统属性等,加载FileStore驱动。 int r = store->mount();
......//这个是读取的current/meta目录下的osd_superblock_xxx文件,而不是osd根目录下的superblock文件(这个是在上面的mount函数里读取的)
r = read_superblock();
/*
(gdb) p superblock
$1 = {cluster_fsid = {uuid = "#\214;EȄI\021\224+\244$\221\002P\277"}, osd_fsid = {
uuid = "\216+\004F\354)B\033\263\023\320\304\022\220\374", <incomplete sequence \342>}, whoami = 1, current_epoch = 20, oldest_map = 1, newest_map = 20,
weight = 0, compat_features = {compat = {mask = 1, names = std::map with 0 elements}, ro_compat = {mask = 1, names = std::map with 0 elements}, incompat = {
mask = 14335, names = std::map with 12 elements = {[1] = "initial feature set(~v.18)", [2] = "pginfo object", [3] = "object locator",
[4] = "last_epoch_clean", [5] = "categories", [6] = "hobjectpool", [7] = "biginfo", [8] = "leveldbinfo", [9] = "leveldblog", [10] = "snapmapper",
[12] = "transaction hints", [13] = "pg meta object"}}}, mounted = 12, clean_thru = 20, last_map_marked_full = 0}
*/......//加载osd down之前保存的最新版本osdmap,具体过程见下面分析 osdmap = get_map(superblock.current_epoch);
...... //加载OSD上已有的pg,具体见下面分析load up pgs (as they previously existed) load_pgs();
......//启动osd的peering线程池 osd_tp.start();
......//消费osdmap,或者说使用osdmap,具体见下面分析 consume_map();
......// 设置osd状态为STATE_BOOTING,OSD启动过程中共有STATE_INITIALIZING(默认值)、STATE_BOOTING、STATE_ACTIVE这几个状态阶段 set_state(STATE_BOOTING);// 准备启动OSD,具体见下面分析 start_boot();
......
}
加载osdmap的 get_map说明:
// Abridged excerpts of the get_map call chain. "......" marks code that has
// been elided from the excerpt.
class OSD: {
  ......
  // osd map cache (past osd maps)
  // Forwards the lookup to OSDService::get_map().
  OSDMapRef get_map(epoch_t e) {
    return service.get_map(e);
  }
  ......
}

class OSDService: {
  ......
  // Same lookup as try_get_map(), but asserts that a map was returned.
  OSDMapRef get_map(epoch_t e) {
    OSDMapRef ret(try_get_map(e));
    assert(ret);
    return ret;
  }
  ......
}

// Returns the osdmap for `epoch`, holding map_cache_lock for the whole call.
//  - If map_cache already has the epoch, the cached map is returned.
//  - For epoch > 0 the encoded map is fetched with _get_map_bl() and decoded;
//    if _get_map_bl() fails, an empty OSDMapRef is returned.
//  - For epoch == 0 a freshly constructed OSDMap is used as the initial map.
// The resulting map is handed to _add_map(), whose return value is returned.
OSDMapRef OSDService::try_get_map(epoch_t epoch) {
  Mutex::Locker l(map_cache_lock);
  // Check whether this epoch's map is already in the osdmap cache.
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    return retval;
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    // Look for this epoch in the osdmap bufferlist cache (map_bl_cache); if it
    // is not there, load it from disk and add it to map_bl_cache.
    if (!_get_map_bl(epoch, bl)) {
      // Nothing could be loaded: free the unused map and report failure.
      delete map;
      return OSDMapRef();
    }
    // Decode the bufferlist data into the osdmap.
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // Add the obtained osdmap to the map_cache cache.
  return _add_map(map);
}
上述osdmap加载过程涉及两个内存缓存:map_cache和map_bl_cache(另外还有一个map_bl_inc_cache,用于缓存增量osdmap的bufferlist)。这两个缓存都基于LRU(Least Recently Used)算法,在OSDService类的构造函数中初始化;默认的缓存空间大小(缓存项最大数量)由配置项osd_map_cache_size决定,其默认值是500,因此启动过程中可缓存的osdmap数量是足够的(根据实际线上环境中osdmap的变化速度,有运维操作时版本变化量在150左右;osdmap的变化数量与osd状态变化次数强相关,没有操作时基本不变)。
加载OSD上已有的pg:
// Abridged excerpt of OSD::load_pgs(): lists the collections in the object
// store and, for each pg found, opens and locks a PG object, reads its
// persisted state and calls handle_loaded() on it. Must be called with
// osd_lock held and with pg_map still empty (both are asserted).
// "......" marks code that has been elided from this excerpt.
void OSD::load_pgs()
{
assert(osd_lock.is_locked());
dout(0) << "load_pgs" << dendl;
{
RWLock::RLocker l(pg_map_lock);
assert(pg_map.empty());
}
vector<coll_t> ls;
int r = store->list_collections(ls);// walk every directory under current/, i.e. every pg
if (r < 0) {
// A listing failure is only logged here; execution continues.
derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
}
......
// pgs is the list of pgs built from ls
for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
++i) {
spg_t pgid(i->first);
......
bufferlist bl;
epoch_t map_epoch = 0;
// Fetch from omap the osdmap epoch associated with this pg; it can be
// regarded as the newest osdmap epoch saved before the osd went down.
int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
......
PG *pg = NULL;
if (map_epoch > 0) {
OSDMapRef pgosdmap = service.try_get_map(map_epoch); // see the try_get_map analysis above
......
pg = _open_lock_pg(pgosdmap, pgid);
} else {
pg = _open_lock_pg(osdmap, pgid); // open the pg object and lock it
}
......
// read pg state, log
pg->read_state(store, bl); // read the pg info and pg log from omap
......
pg->handle_loaded(&rctx); // move the pg state machine into the Reset state, in preparation for peering
......
}
// Builds the PG object for `pgid` with _make_pg(createmap, pgid), locks it,
// registers it in pg_map and returns it. The pg's lock is taken here and is
// not released in this function, so the pg is returned locked.
// The caller must hold osd_lock (asserted).
//   createmap        - osdmap passed to _make_pg(); its epoch is recorded via
//                      service.pg_add_epoch().
//   pgid             - id of the pg to open; also the key used in pg_map.
//   no_lockdep_check - forwarded unchanged to pg->lock().
PG *OSD::_open_lock_pg(
OSDMapRef createmap,
spg_t pgid, bool no_lockdep_check)
{
assert(osd_lock.is_locked());
PG* pg = _make_pg(createmap, pgid);
{
// pg_map is modified below, so pg_map_lock is held for writing in this scope.
RWLock::WLocker l(pg_map_lock);
pg->lock(no_lockdep_check);
pg_map[pgid] = pg; // store the pg in pg_map
pg->get("PGMap"); // because it's in pg_map
service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
}
return pg;
}
使用osdmap:
void OSD::consume_map()
{
......
// scan pg's
{
RWLock::RLocker l(pg_map_