【ClickHouse源码】ReplicatedMergeTree之表创建流程

ReplicatedMergeTree之表创建流程

复制流程是在后台异步进行的。ReplicatedMergeTree在创建时会自动创建多个后台异步task,可以看下其构造函数:

/// Constructs a replicated MergeTree storage.
///
/// In order, the constructor:
///   1. expands macros in the ZooKeeper path / replica name and normalizes the path;
///   2. creates four background tasks in the schedule pool (queue updating, mutations
///      updating, merge selecting, mutations finalizing);
///   3. decides whether sanity checks are skipped and calls loadDataParts();
///   4. if there is no ZooKeeper, or (on ATTACH) no /metadata node, marks the table
///      readonly and returns early;
///   5. otherwise runs either the CREATE branch or the ATTACH branch below and finally
///      calls createNewZooKeeperNodes().
///
/// Throws NO_ZOOKEEPER when creating (not attaching) without ZooKeeper, and
/// INCORRECT_DATA when creating while data parts are already present.
StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    const String & zookeeper_path_,
    const String & replica_name_,
    bool attach,
    const StorageID & table_id_,
    const String & relative_data_path_,
    const StorageInMemoryMetadata & metadata,
    Context & context_,
    const String & date_column_name,
    const MergingParams & merging_params_,
    std::unique_ptr<MergeTreeSettings> settings_,
    bool has_force_restore_data_flag)
    : MergeTreeData(table_id_,
                    relative_data_path_,
                    metadata,
                    context_,
                    date_column_name,
                    merging_params_,
                    std::move(settings_),
                    true,                   /// require_part_metadata
                    attach,
                    [this] (const std::string & name) { enqueuePartForCheck(name); })
    , zookeeper_path(global_context.getMacros()->expand(zookeeper_path_, table_id_.database_name, table_id_.table_name))
    , replica_name(global_context.getMacros()->expand(replica_name_, table_id_.database_name, table_id_.table_name))
    , reader(*this)
    , writer(*this)
    , merger_mutator(*this, global_context.getBackgroundPool().getNumberOfThreads())
    , queue(*this)
    , fetcher(*this)
    , cleanup_thread(*this)
    , part_check_thread(*this)
    , restarting_thread(*this)
{
    /// Normalize the ZooKeeper path string: drop a trailing '/' and make sure it starts with '/'.
    /// Nothing is created in ZooKeeper here; only the local strings are adjusted.
    if (!zookeeper_path.empty() && zookeeper_path.back() == '/')
        zookeeper_path.resize(zookeeper_path.size() - 1);

    if (!zookeeper_path.empty() && zookeeper_path.front() != '/')
        zookeeper_path = "/" + zookeeper_path;
    /// Per-replica subtree: <zookeeper_path>/replicas/<replica_name>.
    replica_path = zookeeper_path + "/replicas/" + replica_name;

    /// Background task that runs queueUpdatingTask().
    queue_updating_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::queueUpdatingTask)", [this]{ queueUpdatingTask(); });
    
    /// Background task that runs mutationsUpdatingTask().
    mutations_updating_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsUpdatingTask)", [this]{ mutationsUpdatingTask(); });

    /// Background task that runs mergeSelectingTask().
    merge_selecting_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mergeSelectingTask)", [this] { mergeSelectingTask(); });
    /// Deactivated right after creation. Presumably it is activated again only once this
    /// replica becomes leader (the activation is not part of this constructor).
    merge_selecting_task->deactivate();

    /// Background task that runs mutationsFinalizingTask().
    mutations_finalizing_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); });

    /// current_zookeeper stays unset when the context has no ZooKeeper configured.
    if (global_context.hasZooKeeper())
        current_zookeeper = global_context.getZooKeeper();

    bool skip_sanity_checks = false;

    /// Decide whether a forced restore was requested. The flag node in ZooKeeper takes
    /// precedence and is removed once seen; otherwise the constructor argument is used.
    if (current_zookeeper && current_zookeeper->exists(replica_path + "/flags/force_restore_data"))
    {
        skip_sanity_checks = true;
        current_zookeeper->remove(replica_path + "/flags/force_restore_data");

        LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag "
            << replica_path << "/flags/force_restore_data).");
    }
    else if (has_force_restore_data_flag)
    {
        skip_sanity_checks = true;

        LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag force_restore_data).");
    }

    /// Load the table's data parts (loadDataParts is defined elsewhere; presumably it reads
    /// the parts already on local disk rather than fetching from other replicas - confirm).
    loadDataParts(skip_sanity_checks);

    /// No ZooKeeper client was obtained above.
    if (!current_zookeeper)
    {
        if (!attach)
            throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);

        /// Do not activate the replica. It will be readonly.
        LOG_ERROR(log, "No ZooKeeper: table will be in readonly mode.");
        is_readonly = true;
        return;
    }

    /// ATTACH, but the table's /metadata node does not exist in ZooKeeper: stay readonly.
    if (attach && !current_zookeeper->exists(zookeeper_path + "/metadata"))
    {
        LOG_WARNING(log, "No metadata in ZooKeeper: table will be in readonly mode.");
        is_readonly = true;
        return;
    }

    /// CREATE branch (not an ATTACH).
    if (!attach)
    {
        /// A freshly created table must not already have data parts.
        if (!getDataParts().empty())
            throw Exception("Data directory for table already containing data parts - probably it was unclean DROP table or manual intervention. You must either clear directory by hand or use ATTACH TABLE instead of CREATE TABLE if you need to use that parts.", ErrorCodes::INCORRECT_DATA);

        /// No parts present: continue with creation.
        /// Presumably creates the table's nodes in ZooKeeper unless they already exist.
        createTableIfNotExists();
        /// Check the local table structure against what is stored under zookeeper_path
        /// (per the original note: metadata and columns).
        checkTableStructure(zookeeper_path);

        /// Remember the ZooKeeper node version of /metadata as the current metadata version.
        Coordination::Stat metadata_stat;
        current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat);
        metadata_version = metadata_stat.version;
        /// Presumably creates this replica's node and its children under /replicas.
        createReplica();
    }
    else
    {
        /// ATTACH branch.
        /// Check the table structure against this replica's own path, then check the data parts.
        checkTableStructure(replica_path);
        checkParts(skip_sanity_checks);

        /// Prefer the replica's own metadata_version node; if it does not exist,
        /// fall back to the node version of the shared /metadata node.
        if (current_zookeeper->exists(replica_path + "/metadata_version"))
        {
            metadata_version = parse<int>(current_zookeeper->get(replica_path + "/metadata_version"));
        }
        else 
        {
            Coordination::Stat metadata_stat;
            current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat);
            metadata_version = metadata_stat.version;
        }
        /// Clear old temporary directories (directories, not ZooKeeper nodes).
        /// NOTE(review): the argument 0 is presumably an age threshold - confirm.
        clearOldTemporaryDirectories(0);
    }
    /// Per the original note: creates the quorum, mutations and related nodes in ZooKeeper.
    createNewZooKeeperNodes();
    other_replicas_fixed_granularity = checkFixedGranualrityInZookeeper();
}

通过以上代码可以知道,在创建ReplicatedMergeTree时会同时创建4个TaskHolder。可以把它们类比为提交给线程池执行器的任务:这些task都在后台的SchedulePool中执行,各自的主要作用如下:

queue_updating_task:负责跟踪所有副本日志中的更新并将其加载到queue中

mutations_updating_task:负责跟踪所有副本日志中的更新并将其加载到mutations中

merge_selecting_task:负责merge任务的选择

mutations_finalizing_task:负责将mutation任务的状态标记为done

以queue_updating_task为例:创建表时把queueUpdatingTask()注册成了queue_updating_task的执行函数,所以该task被调度时就会执行这个方法。这个方法的主要作用是在表创建后触发一次数据clone:同分片的其他副本上可能已经存在同样的表,在新副本上创建该表后就要及时同步数据,以保证数据一致。

/// Body of the queue-updating background task: pulls log entries into the replication
/// queue and records the start / finish timestamps of the update.
/// The exception handlers are elided ("...") in this excerpt.
void StorageReplicatedMergeTree::queueUpdatingTask()
{
    /// Record the start time only when no update is already marked as in progress,
    /// so a run that follows an unfinished one keeps the earlier start time.
    if (!queue_update_in_progress)
    {
        last_queue_update_start_time.store(time(nullptr));
        queue_update_in_progress = true;
    }
    try
    {
        /// Pull new log entries into the queue. Per the original note this reads the
        /// log_pointer, fetches the missing log entries (the log-xxxxx children of the
        /// log node) and adds them to the queue node; the implementation is not shown here.
        /// The task's watch callback is passed along - presumably so that the task is
        /// scheduled again when the watched node changes (confirm).
        queue.pullLogsToQueue(getZooKeeper(), queue_updating_task->getWatchCallback());
        /// Reached only if pullLogsToQueue did not throw.
        last_queue_update_finish_time.store(time(nullptr));
        queue_update_in_progress = false;
    }
    catch (const Coordination::Exception & e)
    {
        ...
    }
    catch (...)
    {
        ...
    }
}

上面的过程不会循环执行,只会执行一次。如果失败会做一些异常处理,但异常处理不能只是返回异常就结束,还要有恢复正常的措施。在dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp中就有类似的逻辑:它的功能是重新启动ReplicatedMergeTree的后台线程,重新做表初始化并重建zk session,它的run()方法会进入到以下流程中(从Restarting这个名字也能看出是这个意思):

/// Tries to bring the replica into its working state: activates the replica, loads and
/// refreshes the replication queue, optionally enters leader election, and finally
/// activates the background tasks and threads.
/// Returns true when all steps completed; the exception handler is elided ("...") in
/// this excerpt, so the failure path is not visible here.
bool ReplicatedMergeTreeRestartingThread::tryStartup()
{
    try
    {
        removeFailedQuorumParts();
        activateReplica();

        const auto & zookeeper = storage.getZooKeeper();
        const auto storage_settings = storage.getSettings();

        storage.cloneReplicaIfNeeded(zookeeper);

        storage.queue.load(zookeeper);

        /// The steps above are preparation (per the original note: e.g. setting is_active
        /// and deciding whether the replica has to be cloned).
        /// Now pull log entries into the queue. Per the original note this reads the
        /// log_pointer, fetches the missing log entries (the log-xxxxx children of the
        /// log node) and adds them to the queue node; the implementation is not shown here.
        storage.queue.pullLogsToQueue(zookeeper);
        storage.queue.removeCurrentPartsFromMutations();
        storage.last_queue_update_finish_time.store(time(nullptr));

        updateQuorumIfWeHavePart();

        /// Leader election is entered only when the table setting allows it.
        if (storage_settings->replicated_can_become_leader)
            storage.enterLeaderElection();
        else
            LOG_INFO(log, "Will not enter leader election because replicated_can_become_leader=0");

        /// Clear the partial-shutdown state before the background work is started.
        storage.partial_shutdown_called = false;
        storage.partial_shutdown_event.reset();

        /// Activate and schedule the background tasks, then start the helper threads.
        /// merge_selecting_task is not activated here.
        storage.queue_updating_task->activateAndSchedule();
        storage.mutations_updating_task->activateAndSchedule();
        storage.mutations_finalizing_task->activateAndSchedule();
        storage.cleanup_thread.start();
        storage.part_check_thread.start();

        return true;
    }
    catch (...)
    {
        ...
    }
}

至此,ReplicatedMergeTree的创建过程就全部完成了,但这也只是ReplicatedMergeTree的第一步。这部分主要做了如下几件事:

  • 创建相关的本地表路径、detached目录及format_version.txt
  • 创建了相关的zk节点
  • 根据不同条件load表的数据

这里可能会有疑惑,代码里没有创建本地表相关的代码,如何实现创建的?

原因在这里

class StorageReplicatedMergeTree : public ext::shared_ptr_helper<StorageReplicatedMergeTree>, public MergeTreeData

StorageReplicatedMergeTree继承了MergeTreeData,仔细看上面的构造方法,也向基类做了相应的传参,所以可以看下MergeTreeData的构造方法(注意:下面摘录的构造函数参数列表与上文构造方法中的传参并不一致,两段代码应来自不同版本的源码,这里只关注其中创建本地目录的逻辑):

/// Base-class constructor (excerpt; parts of the body are elided with "...").
/// The portion shown creates the table's data directories on every data path and
/// locates the single format_version.txt file among them.
/// Throws CORRUPTED_DATA if format_version.txt is found under more than one data path.
MergeTreeData::MergeTreeData(
    const String & database_,
    const String & table_,
    const ColumnsDescription & columns_,
    const IndicesDescription & indices_,
    const ConstraintsDescription & constraints_,
    Context & context_,
    const String & date_column_name,
    const ASTPtr & partition_by_ast_,
    const ASTPtr & order_by_ast_,
    const ASTPtr & primary_key_ast_,
    const ASTPtr & sample_by_ast_,
    const ASTPtr & ttl_table_ast_,
    const MergingParams & merging_params_,
    std::unique_ptr<MergeTreeSettings> storage_settings_,
    bool require_part_metadata_,
    bool attach,
    BrokenPartCallback broken_part_callback_)
    : global_context(context_)
    , merging_params(merging_params_)
    , partition_by_ast(partition_by_ast_)
    , sample_by_ast(sample_by_ast_)
    , ttl_table_ast(ttl_table_ast_)
    , require_part_metadata(require_part_metadata_)
    , database_name(database_)
    , table_name(table_)
    , broken_part_callback(broken_part_callback_)
    , log_name(database_name + "." + table_name)
    , log(&Logger::get(log_name))
    , storage_settings(std::move(storage_settings_))
    , storage_policy(context_.getStoragePolicy(getSettings()->storage_policy))
    , data_parts_by_info(data_parts_indexes.get<TagByInfo>())
    , data_parts_by_state_and_info(data_parts_indexes.get<TagByStateAndInfo>())
    , parts_mover(this)
{
    ...

    /// This is where the local table directories are created.
    auto paths = getDataPaths();
    for (const String & path : paths)
    {
        /// Create the data directory (with parents) and its "detached" subdirectory.
        /// `path` is concatenated directly with file names, so it is expected to end
        /// with a path separator.
        Poco::File(path).createDirectories();
        Poco::File(path + "detached").createDirectory();
        if (Poco::File{path + "format_version.txt"}.exists())
        {
            /// version_file_path is already set: a version file was found under an earlier path.
            if (!version_file_path.empty())
            {
                /// NOTE(review): the log message says "format_file.txt" while the file
                /// actually checked is "format_version.txt" - looks like a typo in the message.
                LOG_ERROR(log, "Duplication of version file " << version_file_path << " and " << path << "format_file.txt");
                throw Exception("Multiple format_version.txt file", ErrorCodes::CORRUPTED_DATA);
            }
            version_file_path = path + "format_version.txt";
        }
    }

    ...
}

这里主要是为了了解ReplicatedMergeTree的创建,所以略去了一些关于MergeTree的逻辑,但是ReplicatedMergeTree也是基于MergeTree的,有兴趣可以自行了解下。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

一只努力的微服务

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值