/// parts should be sorted.
MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
const FutureMergedMutatedPart & future_part,
const StorageMetadataPtr & metadata_snapshot,
MergeList::Entry & merge_entry,
TableLockHolder &,
time_t time_of_merge,
const Context & context,
const ReservationPtr & space_reservation,
bool deduplicate,
const Names & deduplicate_by_columns)
{
static const String TMP_PREFIX = "tmp_merge_";
/// 1. Check whether merges_blocker has been cancelled
if (merges_blocker.isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
/// 2. Check whether this is a TTL-type merge
if (isTTLMergeType(future_part.merge_type) && ttl_merges_blocker.isCancelled())
throw Exception("Cancelled merging parts with TTL", ErrorCodes::ABORTED);
const MergeTreeData::DataPartsVector & parts = future_part.parts;
/// 3. If deduplication is requested, decide which columns to deduplicate by
if (deduplicate)
{
if (deduplicate_by_columns.empty())
LOG_DEBUG(log, "DEDUPLICATE BY all columns");
else
LOG_DEBUG(log, "DEDUPLICATE BY ('{}')", fmt::join(deduplicate_by_columns, "', '"));
}
......
/// 4. Pick out merging_columns and gathering_columns
/// merging_columns contains three kinds of columns:
/// 1. the sort key (the ORDER BY columns; if no primary key is defined, it defaults to the sort key)
/// 2. index columns (secondary_indices, i.e. the so-called secondary or data-skipping indexes)
/// 3. the special columns of the specialized merge modes, such as sign_column (used for deduplication in the
///    Collapsing and VersionedCollapsing modes) and version_column (used for deduplication in the Replacing mode)
/// If none of the above columns exist, the first existing physical column is chosen as the merging column.
/// gathering_columns are the physical columns that remain after the selection above.
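/// For example (hypothetical table): a ReplacingMergeTree(ver) table ORDER BY (k1, k2) with columns
/// (k1, k2, ver, v1, v2) would get merging_columns = {k1, k2, ver} and gathering_columns = {v1, v2}.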
extractMergingAndGatheringColumns(
storage_columns,
metadata_snapshot->getSortingKey().expression,
metadata_snapshot->getSecondaryIndices(),
data.merging_params, // one of 7 modes: Ordinary, Collapsing, Summing, Aggregating, Replacing, Graphite, VersionedCollapsing; the mode is fixed when the table engine is registered
gathering_columns, // non-key columns
gathering_column_names, // names of the non-key columns
merging_columns, // key columns: primary key, sort key and index columns (plus the special merge-mode columns)
merging_column_names); // names of the key columns
auto single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + future_part.name, disk, 0);
/// 5. Create the new part
MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(
future_part.name,
future_part.type,
future_part.part_info,
single_disk_volume,
TMP_PREFIX + future_part.name);
new_data_part->uuid = future_part.uuid;
new_data_part->setColumns(storage_columns);
new_data_part->partition.assign(future_part.getPartition());
new_data_part->is_temp = true;
bool need_remove_expired_values = false;
bool force_ttl = false;
/// 6. Loop over all parts to be merged, decide whether TTL info needs to be updated, and whether TTL must be applied forcefully for these parts.
/// checkAllTTLCalculated checks whether TTL has already been calculated for a part; if the part's TTL info is not
/// consistent with the TTL info in the metadata, TTL must be applied forcefully later. TTL covers ColumnTTL, MoveTTL,
/// GroupByTTL and RowsWhereTTL.
/// For usage details see the official docs: https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/#table_engine-mergetree-ttl
///
/// Possible reasons why a TTL is set but expired data has not been deleted:
/// 1. Row TTL is applied during the key-column merge stage; if a part is never merged, its expired data is never removed.
///    Workarounds:
///    1. Trigger a merge manually with OPTIMIZE ... FINAL or OPTIMIZE ... [PARTITION];
///    2. Set parameters such as merge_with_ttl_timeout and ttl_only_drop_parts when creating the table, so that
///       parts containing expired data are merged more often.
/// 2. The table's TTL was modified or added later, so existing parts have missing or incorrect TTL info, which can also keep expired data from being deleted.
///    Workarounds:
///    1. Regenerate the TTL info with ALTER TABLE ... MATERIALIZE TTL;
///    2. Refresh the TTL info with OPTIMIZE ... [PARTITION].
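/// Both workarounds are ordinary SQL statements, e.g. (hypothetical table name):
///     OPTIMIZE TABLE t PARTITION '202101' FINAL;
///     ALTER TABLE t MATERIALIZE TTL;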
for (const auto & part : parts)
{
new_data_part->ttl_infos.update(part->ttl_infos);
if (metadata_snapshot->hasAnyTTL() && !part->checkAllTTLCalculated(metadata_snapshot))
{
LOG_INFO(log, "Some TTL values were not calculated for part {}. Will calculate them forcefully during merge.", part->name);
need_remove_expired_values = true;
force_ttl = true;
}
}
......
/// 7. Choose the merge algorithm (Horizontal or Vertical). Horizontal is the main algorithm; the conditions for Vertical are rather strict.
/// Horizontal is chosen when any of the following holds:
/// 1. deduplication is requested
/// 2. enable_vertical_merge_algorithm is set to 0 (disabled)
/// 3. expired data needs to be removed
/// 4. the parts do not support the Vertical merge (ordinary MergeTree parts generally do)
/// Vertical is chosen when:
/// 1. the parts support the Vertical merge and merging_params.mode is Ordinary, Collapsing, Replacing or VersionedCollapsing
/// 2. the number of gathering columns >= vertical_merge_algorithm_min_columns_to_activate (default 11)
/// 3. the total number of rows of all parts >= vertical_merge_algorithm_min_rows_to_activate (default 16 * 8192)
/// 4. the number of parts <= MAX_PARTS (0x7F)
/// All of the above conditions must hold to use the Vertical algorithm; otherwise the Horizontal algorithm is used.
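/// These thresholds are MergeTree settings and can be tuned per table, e.g. (illustrative values):
///     ... SETTINGS enable_vertical_merge_algorithm = 1,
///         vertical_merge_algorithm_min_columns_to_activate = 5,
///         vertical_merge_algorithm_min_rows_to_activate = 131072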
MergeAlgorithm chosen_merge_algorithm = chooseMergeAlgorithm(parts, sum_input_rows_upper_bound, gathering_columns, deduplicate, need_remove_expired_values);
merge_entry->merge_algorithm.store(chosen_merge_algorithm, std::memory_order_relaxed);
LOG_DEBUG(log, "Selected MergeAlgorithm: {}", toString(chosen_merge_algorithm));
/// 8. Get the compression codec for the part
auto compression_codec = data.getCompressionCodecForPart(merge_entry->total_size_bytes_compressed, new_data_part->ttl_infos, time_of_merge);
auto tmp_disk = context.getTemporaryVolume()->getDisk();
String rows_sources_file_path;
std::unique_ptr<WriteBufferFromFileBase> rows_sources_uncompressed_write_buf;
std::unique_ptr<WriteBuffer> rows_sources_write_buf;
std::optional<ColumnSizeEstimator> column_sizes;
SyncGuardPtr sync_guard;
/// 9. Check whether the Vertical algorithm was chosen
/// When the Vertical algorithm is used:
/// 1. a rows_sources file is created under the tmp_merge_{future_part_name} directory in relative_data_path
/// 2. column_sizes is computed, i.e. the total (compressed) on-disk size of all merging and gathering columns
/// 3. check whether the part directory must be fsync'ed, controlled by the fsync_part_directory setting (default false)
/// When the Vertical algorithm is not used:
/// 1. merging_columns is set to all physical columns
/// 2. gathering_columns is cleared
if (chosen_merge_algorithm == MergeAlgorithm::Vertical)
{
tmp_disk->createDirectories(new_part_tmp_path);
rows_sources_file_path = new_part_tmp_path + "rows_sources";
rows_sources_uncompressed_write_buf = tmp_disk->writeFile(rows_sources_file_path);
rows_sources_write_buf = std::make_unique<CompressedWriteBuffer>(*rows_sources_uncompressed_write_buf);
MergeTreeData::DataPart::ColumnToSize merged_column_to_size;
for (const MergeTreeData::DataPartPtr & part : parts)
part->accumulateColumnSizes(merged_column_to_size);
column_sizes = ColumnSizeEstimator(merged_column_to_size, merging_column_names, gathering_column_names);
if (data.getSettings()->fsync_part_directory)
sync_guard = disk->getDirectorySyncGuard(new_part_tmp_path);
}
else
{
merging_columns = storage_columns;
merging_column_names = all_column_names;
gathering_columns.clear();
gathering_column_names.clear();
}
/** Read from all parts, merge and write into a new one.
* In passing, we calculate expression for sorting.
*/
Pipes pipes;
UInt64 watch_prev_elapsed = 0;
/// We count total amount of bytes in parts
/// and use direct_io + aio if there is more than min_merge_bytes_to_use_direct_io
/// 10. Decide, based on min_merge_bytes_to_use_direct_io, whether to read with direct_io + aio
bool read_with_direct_io = false;
if (data_settings->min_merge_bytes_to_use_direct_io != 0)
{
size_t total_size = 0;
for (const auto & part : parts)
{
total_size += part->getBytesOnDisk();
if (total_size >= data_settings->min_merge_bytes_to_use_direct_io)
{
LOG_DEBUG(log, "Will merge parts reading files in O_DIRECT");
read_with_direct_io = true;
break;
}
}
}
/// 11. Initialize the MergeStageProgress
/// Checking whether column_sizes is set amounts to checking whether the Vertical algorithm was chosen: column_sizes
/// is only assigned above in the Vertical case. The weight is then the total size of the merging columns divided by
/// the total size of the merging plus gathering columns; with the Horizontal algorithm the weight is 1.
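/// For example (illustrative): if the merging (key) columns account for ~30% of the parts' bytes, the horizontal
/// stage gets weight ~0.3 and the per-column gathering stages share the remaining ~0.7.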
MergeStageProgress horizontal_stage_progress(
column_sizes ? column_sizes->keyColumnsWeight() : 1.0);
/// 12. Build an input source for each part and wrap it in a pipe; a progress callback is attached to every source
/// to update the stage and elapsed time during processing, and all pipes are collected into the pipes vector.
/// If there is a sorting key, a simple transform (an ExpressionTransform computing the sorting key expression) is also added to each part's pipe.
for (const auto & part : parts)
{
auto input = std::make_unique<MergeTreeSequentialSource>(
data, metadata_snapshot, part, merging_column_names, read_with_direct_io, true);
input->setProgressCallback(
MergeProgressCallback(merge_entry, watch_prev_elapsed, horizontal_stage_progress));
Pipe pipe(std::move(input));
if (metadata_snapshot->hasSortingKey())
{
pipe.addSimpleTransform([&metadata_snapshot](const Block & header)
{
return std::make_shared<ExpressionTransform>(header, metadata_snapshot->getSortingKey().expression);
});
}
pipes.emplace_back(std::move(pipe));
}
Names sort_columns = metadata_snapshot->getSortingKeyColumns();
SortDescription sort_description;
size_t sort_columns_size = sort_columns.size();
sort_description.reserve(sort_columns_size);
Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names;
Block header = pipes.at(0).getHeader();
for (size_t i = 0; i < sort_columns_size; ++i)
sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1);
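/// Each SortDescription entry above is (column position in the header, sort direction, nulls direction); 1 means ascending.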
/// The order of the streams is important: when the key is matched, the elements go in the order of the source stream number.
/// In the merged part, the lines with the same key must be in the ascending order of the identifier of original part,
/// that is going in insertion order.
ProcessorPtr merged_transform;
/// If merge is vertical we cannot calculate it
bool blocks_are_granules_size = (chosen_merge_algorithm == MergeAlgorithm::Vertical);
/// 13. The type of merged_transform is determined by merging_params.mode
UInt64 merge_block_size = data_settings->merge_max_block_size;
switch (data.merging_params.mode)
{
case MergeTreeData::MergingParams::Ordinary:
merged_transform = std::make_unique<MergingSortedTransform>(
header, pipes.size(), sort_description, merge_block_size, 0, rows_sources_write_buf.get(), true, blocks_are_granules_size);
break;
......
}
/// 14. Build the QueryPipeline and add the individual streams to it
QueryPipeline pipeline;
pipeline.init(Pipe::unitePipes(std::move(pipes)));
pipeline.addTransform(std::move(merged_transform));
pipeline.setMaxThreads(1);
BlockInputStreamPtr merged_stream = std::make_shared<PipelineExecutingBlockInputStream>(std::move(pipeline));
if (deduplicate)
merged_stream = std::make_shared<DistinctSortedBlockInputStream>(merged_stream, sort_description, SizeLimits(), 0 /*limit_hint*/, deduplicate_by_columns);
if (need_remove_expired_values)
merged_stream = std::make_shared<TTLBlockInputStream>(merged_stream, data, metadata_snapshot, new_data_part, time_of_merge, force_ttl);
if (metadata_snapshot->hasSecondaryIndices())
{
const auto & indices = metadata_snapshot->getSecondaryIndices();
merged_stream = std::make_shared<ExpressionBlockInputStream>(
merged_stream, indices.getSingleExpressionForIndices(metadata_snapshot->getColumns(), data.global_context));
merged_stream = std::make_shared<MaterializingBlockInputStream>(merged_stream);
}
const auto & index_factory = MergeTreeIndexFactory::instance();
MergedBlockOutputStream to{
new_data_part,
metadata_snapshot,
merging_columns,
index_factory.getMany(metadata_snapshot->getSecondaryIndices()),
compression_codec,
blocks_are_granules_size};
/// 15. Create an executor for each stream
merged_stream->readPrefix();
to.writePrefix();
size_t rows_written = 0;
const size_t initial_reservation = space_reservation ? space_reservation->getSize() : 0;
auto is_cancelled = [&]() { return merges_blocker.isCancelled()
|| (need_remove_expired_values && ttl_merges_blocker.isCancelled()); };
/// 16. Read the data block by block
/// Each read drives the pull method of every stream's executor, which performs the actual reading and processing
Block block;
while (!is_cancelled() && (block = merged_stream->read()))
{
rows_written += block.rows();
to.write(block);
merge_entry->rows_written = merged_stream->getProfileInfo().rows;
merge_entry->bytes_written_uncompressed = merged_stream->getProfileInfo().bytes;
/// Reservation updates is not performed yet, during the merge it may lead to higher free space requirements
if (space_reservation && sum_input_rows_upper_bound)
{
/// The same progress from merge_entry could be used for both algorithms (it should be more accurate)
/// But now we are using inaccurate row-based estimation in Horizontal case for backward compatibility
Float64 progress = (chosen_merge_algorithm == MergeAlgorithm::Horizontal)
? std::min(1., 1. * rows_written / sum_input_rows_upper_bound)
: std::min(1., merge_entry->progress.load(std::memory_order_relaxed));
space_reservation->update(static_cast<size_t>((1. - progress) * initial_reservation));
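/// e.g. (illustrative): with an initial reservation of 10 GiB and progress 0.4, the reservation shrinks to about 6 GiB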
}
}
/// 17. Record how long the operations above took
merged_stream->readSuffix();
merged_stream.reset();
if (merges_blocker.isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
if (need_remove_expired_values && ttl_merges_blocker.isCancelled())
throw Exception("Cancelled merging parts with expired TTL", ErrorCodes::ABORTED);
bool need_sync = needSyncPart(sum_input_rows_upper_bound, sum_compressed_bytes_upper_bound, *data_settings);
MergeTreeData::DataPart::Checksums checksums_gathered_columns;
/// Gather ordinary columns
/// 18. Handle the merge of the gathering_columns: each gathering column is merged separately, driven by the
/// rows_sources file produced while merging the merging columns; the rows_sources file is removed afterwards.
/// The rows_sources file records, for every row of the merged result, which source part the row came from
/// (see the notes after the function).
if (chosen_merge_algorithm == MergeAlgorithm::Vertical)
{
......
for (size_t column_num = 0, gathering_column_names_size = gathering_column_names.size();
column_num < gathering_column_names_size;
++column_num, ++it_name_and_type)
{
const String & column_name = it_name_and_type->name;
Names column_names{column_name};
Float64 progress_before = merge_entry->progress.load(std::memory_order_relaxed);
MergeStageProgress column_progress(progress_before, column_sizes->columnWeight(column_name));
for (size_t part_num = 0; part_num < parts.size(); ++part_num)
{
auto column_part_source = std::make_shared<MergeTreeSequentialSource>(
data, metadata_snapshot, parts[part_num], column_names, read_with_direct_io, true);
column_part_source->setProgressCallback(
MergeProgressCallback(merge_entry, watch_prev_elapsed, column_progress));
QueryPipeline column_part_pipeline;
column_part_pipeline.init(Pipe(std::move(column_part_source)));
column_part_pipeline.setMaxThreads(1);
column_part_streams[part_num] =
std::make_shared<PipelineExecutingBlockInputStream>(std::move(column_part_pipeline));
}
rows_sources_read_buf.seek(0, 0);
ColumnGathererStream column_gathered_stream(column_name, column_part_streams, rows_sources_read_buf);
MergedColumnOnlyOutputStream column_to(
new_data_part,
metadata_snapshot,
column_gathered_stream.getHeader(),
compression_codec,
/// we don't need to recalc indices here
/// because all of them were already recalculated and written
/// as key part of vertical merge
std::vector<MergeTreeIndexPtr>{},
&written_offset_columns,
to.getIndexGranularity());
size_t column_elems_written = 0;
column_to.writePrefix();
while (!merges_blocker.isCancelled() && (block = column_gathered_stream.read()))
{
column_elems_written += block.rows();
column_to.write(block);
}
if (merges_blocker.isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
column_gathered_stream.readSuffix();
auto changed_checksums = column_to.writeSuffixAndGetChecksums(new_data_part, checksums_gathered_columns, need_sync);
checksums_gathered_columns.add(std::move(changed_checksums));
if (rows_written != column_elems_written)
{
throw Exception("Written " + toString(column_elems_written) + " elements of column " + column_name +
", but " + toString(rows_written) + " rows of PK columns", ErrorCodes::LOGICAL_ERROR);
}
/// NOTE: 'progress' is modified by single thread, but it may be concurrently read from MergeListElement::getInfo() (StorageSystemMerges).
merge_entry->columns_written += 1;
merge_entry->bytes_written_uncompressed += column_gathered_stream.getProfileInfo().bytes;
merge_entry->progress.store(progress_before + column_sizes->columnWeight(column_name), std::memory_order_relaxed);
}
tmp_disk->removeFile(rows_sources_file_path);
}
for (const auto & part : parts)
new_data_part->minmax_idx.merge(part->minmax_idx);
/// 19. Log the overall merge statistics (rows, columns, elapsed time, throughput)
{
double elapsed_seconds = merge_entry->watch.elapsedSeconds();
LOG_DEBUG(log,
"Merge sorted {} rows, containing {} columns ({} merged, {} gathered) in {} sec., {} rows/sec., {}/sec.",
merge_entry->rows_read,
all_column_names.size(),
merging_column_names.size(),
gathering_column_names.size(),
elapsed_seconds,
merge_entry->rows_read / elapsed_seconds,
ReadableSize(merge_entry->bytes_read_uncompressed / elapsed_seconds));
}
/// 20. Finalization
/// Mainly updates the in-memory info of the new part and flushes files such as the checksums, count.txt and ttl.txt to disk
if (chosen_merge_algorithm != MergeAlgorithm::Vertical)
to.writeSuffixAndFinalizePart(new_data_part, need_sync);
else
to.writeSuffixAndFinalizePart(new_data_part, need_sync, &storage_columns, &checksums_gathered_columns);
return new_data_part;
}
The rows_sources file records, for the sorted output produced during the merge, an identifier of the source each row came from. It is created under the tmp directory of the ClickHouse data root, and only when the Vertical algorithm is used.
With the Horizontal algorithm, all columns are read and take part in the merge, and no rows_sources file is produced; if the table has many columns, a lot of data is read and many columns participate in the computation, which costs a lot of memory and CPU.
With the Vertical algorithm, only the essential columns (primary key, ORDER BY and index columns) are read and merged first, and during that merge the source identifiers of the sorted output are written to the rows_sources file. The remaining ordinary columns can then be stitched together directly in the order recorded in rows_sources, skipping the sorting step entirely. Fewer columns take part in sorting, so the merge of the essential columns uses less memory and CPU, and merging an ordinary column becomes a simple matter of reading its blocks in the recorded order and concatenating them, which can be done column by column instead of having all columns participate at once.
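To make the mechanism concrete, here is a minimal, self-contained sketch (not ClickHouse code; all names are made up for illustration) of the two phases of a Vertical merge: the key columns are merged first and a source index is recorded per output row (playing the role of rows_sources), then an ordinary column is gathered by simply replaying that sequence, with no further sorting.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    /// Two hypothetical parts, each already sorted by its key column.
    std::vector<std::vector<int>> key_by_part = {{1, 4, 7}, {2, 3, 9}};
    std::vector<std::vector<std::string>> value_by_part = {{"a", "d", "g"}, {"b", "c", "i"}};

    /// Phase 1: merge the key columns and record, per output row, the index of the
    /// source part it came from (this sequence plays the role of the rows_sources file).
    std::vector<int> merged_keys;
    std::vector<uint8_t> rows_sources;
    std::vector<size_t> pos(key_by_part.size(), 0);
    while (true)
    {
        int best = -1;
        for (size_t p = 0; p < key_by_part.size(); ++p)
            if (pos[p] < key_by_part[p].size()
                && (best < 0 || key_by_part[p][pos[p]] < key_by_part[best][pos[best]]))
                best = static_cast<int>(p);   /// on equal keys the earlier part wins
        if (best < 0)
            break;
        merged_keys.push_back(key_by_part[best][pos[best]++]);
        rows_sources.push_back(static_cast<uint8_t>(best));
    }

    /// Phase 2: gather an ordinary column by replaying rows_sources; each part's column
    /// is read sequentially and its values are taken in the recorded order, no sorting needed.
    std::vector<std::string> merged_values;
    std::vector<size_t> gather_pos(value_by_part.size(), 0);
    for (uint8_t source : rows_sources)
        merged_values.push_back(value_by_part[source][gather_pos[source]++]);

    for (size_t i = 0; i < merged_keys.size(); ++i)
        std::cout << merged_keys[i] << " -> " << merged_values[i] << '\n';
}

Running the sketch prints the rows in key order (1 -> a, 2 -> b, 3 -> c, ...), showing that once the key merge has fixed the row order, every other column can be assembled independently by a cheap sequential pass.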