MergeTree引擎
一.MergeTree引擎
MergeTree系列的表引擎是clickhouse数据存储功能的核心,它们提供了用于弹性和高性能数据检索的大多数功能:列存储,自定义分区,稀疏的主索引,辅助数据跳过索引等
主要特点:
- 存储按主键排序的数据(必须有order by(主键,…))
- 如果指定了分区键,则可以使用分区
- 数据复制支持
- 数据采样支持
create table tb_merge_tree(
uid UInt8,
name String,
birthday Date,
city String,
gender String
)engine=MergeTree()
primary key uid
order by (uid,birthday);
insert into tb_merge_tree values
(1,'zs',toDate(now()),'HZ','M'),
(2,'ls','2021-09-27','HZ','M'),
(3,'ww','2021-08-18','SH','F'),
(4,'zl','1999-09-27','JS','F');
select * from tb_merge_tree;
┌─uid─┬─name─┬───birthday─┬─city─┬─gender─┐
│ 1 │ zs │ 2021-08-30 │ HZ │ M │
│ 2 │ ls │ 2021-09-27 │ HZ │ M │
│ 3 │ ww │ 2021-08-18 │ SH │ F │
│ 4 │ zl │ 1999-09-27 │ JS │ F │
└─────┴──────┴────────────┴──────┴────────┘
**看一下底层存储**
[root@spark-140 test1]# cd tb_merge_tree/
[root@spark-140 tb_merge_tree]# ll
total 12
drwxr-x--- 2 root root 4096 Aug 30 17:18 all_1_1_0 //存着表的所有数据
drwxr-x--- 2 root root 4096 Aug 30 17:18 detached
-rw-r----- 1 root root 1 Aug 30 17:18 format_version.txt
[root@spark-140 tb_merge_tree]# cd all_1_1_0/
[root@spark-140 all_1_1_0]# ll
total 28
-rw-r----- 1 root root 187 Aug 30 17:18 checksums.txt //数据安全校验
-rw-r----- 1 root root 109 Aug 30 17:18 columns.txt //列信息文件,存放列字段信息
-rw-r----- 1 root root 1 Aug 30 17:18 count.txt //记录当前数据的条数
-rw-r----- 1 root root 174 Aug 30 17:18 data.bin //数据文件
-rw-r----- 1 root root 176 Aug 30 17:18 data.mrk3 //列字段标记文件
-rw-r----- 1 root root 10 Aug 30 17:18 default_compression_codec.txt
-rw-r----- 1 root root 2 Aug 30 17:18 primary.idx //一级索引文件,用于存放稀疏索引
[root@spark-140 all_1_1_0]# cat count.txt4
insert into tb_merge_tree values
(1,'zs',toDate(now()),'HZ','M'),
(2,'ls','2021-09-27','HZ','M'),
(3,'ww','2021-08-18','SH','F'),
(4,'zl','1999-09-27','JS','F'); //再次执行一次插入操作
select * from tb_merge_tree;
┌─uid─┬─name─┬───birthday─┬─city─┬─gender─┐
│ 1 │ zs │ 2021-08-30 │ HZ │ M │
│ 2 │ ls │ 2021-09-27 │ HZ │ M │
│ 3 │ ww │ 2021-08-18 │ SH │ F │
│ 4 │ zl │ 1999-09-27 │ JS │ F │
└─────┴──────┴────────────┴──────┴────────┘
┌─uid─┬─name─┬───birthday─┬─city─┬─gender─┐
│ 1 │ zs │ 2021-08-30 │ HZ │ M │
│ 2 │ ls │ 2021-09-27 │ HZ │ M │
│ 3 │ ww │ 2021-08-18 │ SH │ F │
│ 4 │ zl │ 1999-09-27 │ JS │ F │
└─────┴──────┴────────────┴──────┴────────┘
//主键没有唯一要求,是可以重复的
[root@spark-140 tb_merge_tree]# ll
total 16
drwxr-x--- 2 root root 4096 Aug 30 17:18 all_1_1_0
drwxr-x--- 2 root root 4096 Aug 30 17:26 all_2_2_0 //第二次插入操作的文件
drwxr-x--- 2 root root 4096 Aug 30 17:18 detached
-rw-r----- 1 root root 1 Aug 30 17:18 format_version.txt
[root@spark-140 tb_merge_tree]# cd all_2_2_0/
[root@spark-140 all_2_2_0]# ll
total 28
-rw-r----- 1 root root 187 Aug 30 17:26 checksums.txt
-rw-r----- 1 root root 109 Aug 30 17:26 columns.txt
-rw-r----- 1 root root 1 Aug 30 17:26 count.txt
-rw-r----- 1 root root 174 Aug 30 17:26 data.bin
-rw-r----- 1 root root 176 Aug 30 17:26 data.mrk3
-rw-r----- 1 root root 10 Aug 30 17:26 default_compression_codec.txt
-rw-r----- 1 root root 2 Aug 30 17:26 primary.idx
[root@spark-140 all_2_2_0]# cat count.txt
4
**合并这两个数据块**
optimize table tb_merge_tree [final]; //final关键字是强制合并的意思,用于多个表合并时
select * from tb_merge_tree;
┌─uid─┬─name─┬───birthday─┬─city─┬─gender─┐
│ 1 │ zs │ 2021-08-30 │ HZ │ M │
│ 1 │ zs │ 2021-08-30 │ HZ │ M │
│ 2 │ ls │ 2021-09-27 │ HZ │ M │
│ 2 │ ls │ 2021-09-27 │ HZ │ M │
│ 3 │ ww │ 2021-08-18 │ SH │ F │
│ 3 │ ww │ 2021-08-18 │ SH │ F │
│ 4 │ zl │ 1999-09-27 │ JS │ F │
│ 4 │ zl │ 1999-09-27 │ JS │ F │
└─────┴──────┴────────────┴──────┴────────┘
这时再看一下底层存储
[root@spark-140 tb_merge_tree]# ll
total 20
drwxr-x--- 2 root root 4096 Aug 30 17:18 all_1_1_0
drwxr-x--- 2 root root 4096 Aug 30 17:31 all_1_2_1 //这是合并操作的文件
drwxr-x--- 2 root root 4096 Aug 30 17:26 all_2_2_0
drwxr-x--- 2 root root 4096 Aug 30 17:18 detached
-rw-r----- 1 root root 1 Aug 30 17:18 format_version.txt
[root@spark-140 tb_merge_tree]# cd all_1_2_1/
//all_1_2_1数字分别代表什么 1代表最低版本,2代表最高版本,1代表合并操作一次
[root@spark-140 all_1_2_1]# ll
total 28-rw-r----- 1 root root 187 Aug 30 17:31 checksums.txt
-rw-r----- 1 root root 109 Aug 30 17:31 columns.txt
-rw-r----- 1 root root 1 Aug 30 17:31 count.txt
-rw-r----- 1 root root 205 Aug 30 17:31 data.bin
-rw-r----- 1 root root 176 Aug 30 17:31 data.mrk3
-rw-r----- 1 root root 10 Aug 30 17:31 default_compression_codec.txt
-rw-r----- 1 root root 2 Aug 30 17:31 primary.idx
[root@spark-140 all_1_2_1]# cat count.txt
8
create table tb_merge_tree_partition(
uid UInt8,
name String,
birthday DateTime
)engine=MergeTree()
partition by toDate(birthday)
order by uid;
insert into tb_merge_tree_partition values
(1,'zs','2020-09-27 10:00:00'),
(2,'ls','2020-09-27 11:00:00'),
(3,'ww','2020-08-18 10:00:00'),
(4,'zl','2020-08-18 11:00:00');
select * from tb_merge_tree_partition;
┌─uid─┬─name─┬────────────birthday─┐
│ 1 │ zs │ 2020-09-27 10:00:00 │
│ 2 │ ls │ 2020-09-27 11:00:00 │
└─────┴──────┴─────────────────────┘
┌─uid─┬─name─┬────────────birthday─┐
│ 3 │ ww │ 2020-08-18 10:00:00 │
│ 4 │ zl │ 2020-08-18 11:00:00 │
└─────┴──────┴─────────────────────┘
insert into tb_merge_tree_partition values
(11,'zss','2020-09-27 10:00:00'),
(12,'lss','2020-09-27 11:00:00'),
(13,'www','2020-08-18 10:00:00'),
(14,'zll','2020-08-18 11:00:00');
select * from tb_merge_tree_partition;
┌─uid─┬─name─┬────────────birthday─┐
│ 1 │ zs │ 2020-09-27 10:00:00 │
│ 2 │ ls │ 2020-09-27 11:00:00 │
└─────┴──────┴─────────────────────┘
┌─uid─┬─name─┬────────────birthday─┐
│ 3 │ ww │ 2020-08-18 10:00:00 │
│ 4 │ zl │ 2020-08-18 11:00:00 │
└─────┴──────┴─────────────────────┘
┌─uid─┬─name─┬────────────birthday─┐
│ 11 │ zss │ 2020-09-27 10:00:00 │
│ 12 │ lss │ 2020-09-27 11:00:00 │
└─────┴──────┴─────────────────────┘
┌─uid─┬─name─┬────────────birthday─┐
│ 13 │ www │ 2020-08-18 10:00:00 │
│ 14 │ zll │ 2020-08-18 11:00:00 │
└─────┴──────┴─────────────────────┘
**看一下底层存储**
cd /data/clickhouse/data/test1/tb_merge_tree_partition
[root@spark-140 tb_merge_tree_partition]# ll
total 24
drwxr-x--- 2 root root 4096 Aug 30 19:58 20200818_2_2_0
drwxr-x--- 2 root root 4096 Aug 30 19:59 20200818_4_4_0
drwxr-x--- 2 root root 4096 Aug 30 19:58 20200927_1_1_0
drwxr-x--- 2 root root 4096 Aug 30 19:59 20200927_3_3_0
drwxr-x--- 2 root root 4096 Aug 30 19:58 detached
-rw-r----- 1 root root 1 Aug 30 19:58 format_version.txt
**合并操作**
optimize table tb_merge_tree_partition final;
select * from tb_merge_tree_partition;
┌─uid─┬─name─┬────────────birthday─┐
│ 1 │ zs │ 2020-09-27 10:00:00 │
│ 2 │ ls │ 2020-09-27 11:00:00 │
│ 11 │ zss │ 2020-09-27 10:00:00 │
│ 12 │ lss │ 2020-09-27 11:00:00 │
└─────┴──────┴─────────────────────┘
┌─uid─┬─name─┬────────────birthday─┐
│ 3 │ ww │ 2020-08-18 10:00:00 │
│ 4 │ zl │ 2020-08-18 11:00:00 │
│ 13 │ www │ 2020-08-18 10:00:00 │
│ 14 │ zll │ 2020-08-18 11:00:00 │
└─────┴──────┴─────────────────────┘
[root@spark-140 tb_merge_tree_partition]# ll
total 28
drwxr-x--- 2 root root 4096 Aug 30 19:58 20200818_2_2_0
drwxr-x--- 2 root root 4096 Aug 30 20:01 20200818_2_4_1
drwxr-x--- 2 root root 4096 Aug 30 19:59 20200818_4_4_0
drwxr-x--- 2 root root 4096 Aug 30 19:58 20200927_1_1_0
drwxr-x--- 2 root root 4096 Aug 30 20:02 20200927_1_3_1
drwxr-x--- 2 root root 4096 Aug 30 19:59 20200927_3_3_0
drwxr-x--- 2 root root 4096 Aug 30 19:58 detached
-rw-r----- 1 root root 1 Aug 30 19:58 format_version.txt