OLAP在线联合分析数据库之ClickHouse之入门及分区操作_clickhouse olap 如何关联分析-CSDN博客

本文链接：https://blog.csdn.net/Exception_sir/article/details/118677578

Clickhouse基本操作

基本操作语句
DDL操作
DML操作
CK分区操作

基本操作语句

建库语句

假如库名为 test_db

show databases;
create database if not exists test_db;
# 转换数据库
use test_db;
# 查询当前使用的数据库
select currentDatabse();
# 删除数据库
drop database test_db;

连接数据库时报错

clickhouse-client -h xxx01 报错“拒绝连接”，需要做以下处理：

vi /etc/clickhouse-server/config.xml
# 修改配置文件开启允许外部客户端连接
<listen_host>::</listen_host>
# 重启clickhouse
service clickhouse-server retsart 
#再去使用上次操作

clickhouse交互查询

clickhouse-client -q '执行一条sql';
# -n/multiquery
clickhouse-client -n -q '执行多条查询语句语句之间使用；隔开';
# -m 可以换行执行多条语句
clickhouse-client -m;

ck数据类型

先了解一下基本的ck建表语句：

# 一定要指定引擎
# clickhouse里面的关键字对大小写敏感
create table test1(
	age Int8,
	name String
)engine=Memory;

表结构查询：

desc test1;

基本的数据结构
请参考https://clickhouse.tech/docs/en/sql-reference/data-types
UUID的使用

CREATE table tb_test1_uuid(
	id UUID,
	age UInt8,
	name String,
	job FixedString(8),
	salary Float64,
	score Decimal(5,2)

)engine=Memory;

DESC tb_test1_uuid;
SELECT generateUUIDv4();
INSERT into tb_test1_uuid (age,name,job,salary,score) values(12,'dhj','程序',1000.99,100);
SELECT * FROM tb_test1_uuid ttu ;
INSERT into tb_test1_uuid (id,age,name,job,salary,score) values(generateUUIDv4(),28,'dhj','java',1000.99,100);
INSERT into tb_test1_uuid  values(generateUUIDv4(),33,'正三','c++',1600.99,66);

Enum枚举类型

-- 创建一个含有枚举类型的(虽然插入的是字符串，但是实际底层存储的是数字，节省存储空间占用)
create table test_enum(
	id Int8,
	gender Enum('男'=1,'女'=2)
)engine=Memory;

desc test_enum;
INSERT into test_enum values (1,'男');
INSERT into test_enum values (3,2);
select * from test_enum ;

Array数组类型

-- 创建一个带数组的语句；
select [1,2,3];
SELECT array('a','b','c');
CREATE table test_array(
	hoby Array(String)
)engine=Memory;

INSERT into test_array values(['aa','bb','cc']),(array('AA','BB','CC'));

SELECT * FROM test_array ;

select hoby,toTypeName(hoby) from test_array ;

SELECT hoby[1] FROM test_array ; --对应数组的下标为0，ck中是从1开始；

Date时间类型

-- 创建时间类型
CREATE table test_date(
	id Int16,
	birthday Date, -- 精确到天
	createtime DateTime, -- 精确到时分秒
	createtime2 DateTime64, --精确到亚秒，不指定亚秒时间位数，默认为3
	createtime3 DateTime64(9)
)engine=Memory;

INSERT into test_date values (1,now(),now(),now(),now());

select * from test_date ;

Tuple元组类型

--Tuple 元组
CREATE table test_tuple (
	id UUID,
	tt Tuple(String,Int8,Float32)
)engine=Memory;

INSERT into test_tuple values (generateUUIDv4(),('张三',12,9.08)),(generateUUIDv4(),tuple('王五',23,10.08));

SELECT * FROM  test_tuple tt ;

Nested嵌入表结构类型

-- nested 嵌入表数据类型
DROP table if exists test_nested;
create table test_nested(
	id Int8,
	name String,
	course Nested(
		course_num String,
		course_name String,
		score Float64
	)

)engine=Memory;

desc test_nested;
-- 注意嵌套的表中每行数组中的长度是一致的，但是行与行之间的嵌套表中的数组长度可以不保持一致
INSERT into test_nested values (1,'邓怀俊',['01','02','03'],['语文','数学','英语'],[99.5,100,88]);

--查询学员及对应的课程名称和分数
select name,course.course_name,course.score from test_nested;
--查询学员及对应的第一门课程名称和分数
select name,course.course_name[1],course.score[1] from test_nested;

IPv4类型

-- IPv4对于IP地址默认存在校验功能
CREATE table test_ipv4(id Int32,ip IPv4)engine=Memory;

desc test_ipv4;

INSERT into test_ipv4 values(1,'10.10.10.10');

select ip,toTypeName(ip) as colType from test_ipv4 ;

Nullable和Boolean类型

-- Boolean和Nullable类型
--ck中没有布尔类型，只有使用0和1代表false和true，Nullable某种数据类型允许为null,或者没有给值的情况下模式是null

CREATE table test_nullable(id Int32,age UInt32)engine=Memory;

--对于int类型若不给值，则默认插入与数据类型一致的数值
INSERT into test_nullable(id) values (22); 

SELECT * from test_nullable ;

DROP table if exists test_nullable;
CREATE table test_nullable(id Int32,age Nullable(UInt32))engine=Memory;

--再次插入数据，若age不给值则默认插入null;
INSERT into test_nullable(id) values (22);

DDL操作

建表

目前只有MergeTree、Merge、Distributed这三类表引擎支持Alter查询，所以在进行Alter表操作时，注意表的引擎，而且在建表的时候一定要指定引擎。

-- 建表
-- 目前只有MergeTree、Merge、Distributed这三类表引擎支持Alter查询，所以在进行Alter表操作时，注意表的引擎，而且在建表的时候一定要指定引擎。
drop table if exists test_engine01;
create table test_engine01(
	id UUID,
	name String
)engine=Memory();

desc test_engine01;

--现在的引擎为内存引擎，我们执行修改表操作
ALTER table test_engine01 add column age UInt32;
-- 执行以上DDL语句发现报错"Alter of type 'ADD COLUMN' is not supported by storage Memory "--于是我们新建一个MergeTree引擎的表
-- MergeTree引擎在建表时一定要指定主键和排序字段，若主键和排序字段相同，直接使用order by 即可。
create table test_engine02(
	id UUID,
	name String
)engine=MergeTree() order by id;

--再次执行修改表结构语句，即成功
ALTER table test_engine02 add column age UInt32;
SELECT * FROM  test_engine02;
--查看建表语句及引擎和参数设置
show create table test_engine02;

-- 新建一个带注释有默认值的表
create table test_engine03(
	id Int32 comment 'id',
	name String comment '姓名',
	birthday DateTime comment '生日',
	age UInt32
	
)engine=MergeTree()
order by (id,birthday);

show CREATE table test_engine03;

--返回结果：CREATE TABLE test_db.test_engine02 (`id` UUID, `name` String, `age` UInt32) ENGINE = MergeTree() ORDER BY id SETTINGS index_granularity = 8192 默认索引粒度8192；

修改表

-- 修改表结构
ALTER table test_engine02 add column age UInt32;  --默认新增的字段在原有的字段后面
desc test_engine02;
-- 如果我想加一个字段在原有某个字段后面，如加个sex字段在name后面，可以使用以下语句
ALTER table test_engine02 add column sex Int8 after name;
desc test_engine02;
-- 删除列
ALTER table test_engine02 drop column sex;
desc test_engine02;
-- 修改列的字段类型
ALTER table test_engine02 modify column sex String default '男';
desc test_engine02;
-- 给表中某一列添加注释,内部默认使用默认编码为UTF-8
ALTER table test_engine02 comment column name '姓名';

重命名表及移动表

--修改表名
rename table tb_test1_uuid to tb_test1;
--修改多张表名
rename table test_engine01 to test_create_engine01,test_engine02 to test_create_engine02;
--移动表到另一数据库中并重命名
rename table default.test1 to test_db.default_test1;
-- 查看数据库中所有表
show tables;
show tables from db_name;

设置表的属性

-- 设置列的默认值
DROP table if exists tb_test2;
create table tb_test2(
	id Int8,
	name String,
	role String  default 'vip' comment '角色'
)engine=Log();

desc tb_test2;

INSERT into tb_test2 (id,name)values(1,'李四');

SELECT * from tb_test2;

DML操作

数据导入方式

-- 创建一个表
create table test_export01(
	id Int8,
	name String,
	age Int8,
	sex String 
)engine=Log();

-- 方式1：通过insert into的方式
INSERT  into test_export01 values (1,'aa',21,'男'),(2,'bb',32,'女');

-- 方式2：复制表的方式
CREATE table test_export02 as test_export01;
--将test_export01的数据复制到02中
insert into test_export02 select * from test_export01 ;

-- 方式3：通过csv文件方式导入
CREATE table test_export03 as test_export01;
-- 使用客户端命令前提txt文件字段是使用逗号分隔
-- cat test.txt | clickhouse-client -q 'insert into test_db.test_export03 format CSV'; 
-- clickhouse-client -q 'insert into test_db.test_export03 format CSV' < test.txt ;
-- 但是以上俩种方式默认文本中分隔符为逗号，若为其它分隔符，则使用以下语句：
-- clickhouse-client --format_csv_delimiter='分隔符' -q 'insert into test_db.test_export03 format CSV' < test.txt ;
-- 若使用sql批量执行导入：
-- clickhouse-client  --multiquery <  /home/clickhouse一键建表脚本.sql
SELECT * FROM test_export03;

数据的更新和删除
只有MergeTree引擎的数据才支持修改操作！

create table test_update(
	id Int8,
	name String,
	age Int8,
	sex String 
)engine=MergeTree() order by id;

INSERT into test_update select * from test_export03 ;

--更新
ALTER table test_update update name='ZZ' where id=1;
--删除
ALTER table test_update delete where id=1;

select * from test_update;

CK分区操作

新建分区

-- 目前只有MergeTree系列的表引擎支持数据分区，可以支持区内排序、去重、合并

-- 分区表的创建--以创建时间的年月日分区
create table test_partition(
id Int32, 
name String, 
createTime DateTime
)engine = MergeTree() 
partition by toDate(createTime)
order by id;

--查看创建分区表的语句
show CREATE table test_partition ;
select * from test_partition;
-- 可以看出不同时间段的分区在终端查询的分区展示：
┌─id─┬─name─┬──────────createTime─┐
│  4 │ aa   │ 2021-07-08 12:12:12 │
│  5 │ bb   │ 2021-07-08 23:22:22 │
└────┴──────┴─────────────────────┘
┌─id─┬─name─┬──────────createTime─┐
│  3 │ cc   │ 2021-07-01 11:11:11 │
└────┴──────┴─────────────────────┘
┌─id─┬─name─┬──────────createTime─┐
│  1 │ aa   │ 2021-02-08 12:12:12 │
│  2 │ bb   │ 2021-02-08 02:22:22 │
└────┴──────┴─────────────────────┘

# 我们查看数据存放目录，可以看见不同的日期区间的数据存放在不同的目录：
root@bb8399987902:/var/lib/clickhouse/data/test_db/test_partition# pwd
/var/lib/clickhouse/data/test_db/test_partition
root@bb8399987902:/var/lib/clickhouse/data/test_db/test_partition# ll
total 4
drwxr-x---  6 clickhouse clickhouse 138 Jul 27 15:05 ./
drwxr-x--- 10 clickhouse clickhouse 211 Jul 27 14:59 ../
drwxr-x---  2 clickhouse clickhouse 291 Jul 27 15:05 20210208_1_1_0/
drwxr-x---  2 clickhouse clickhouse 291 Jul 27 15:05 20210701_2_2_0/
drwxr-x---  2 clickhouse clickhouse 291 Jul 27 15:05 20210708_3_3_0/
drwxr-x---  2 clickhouse clickhouse   6 Jul 27 14:59 detached/
-rw-r-----  1 clickhouse clickhouse   1 Jul 27 14:59 format_version.txt
root@bb8399987902:/var/lib/clickhouse/data/test_db/test_partition#

我们也可以使用另外的方式查看表的分区情况

show databases;
use system;
select table,name,partition from parts where table='表名';

# 结果如下
SELECT 
    table, 
    name, 
    partition
FROM parts
WHERE table = 'test_partition'

┌─table──────────┬─name───────────┬─partition──┐
│ test_partition │ 20210208_1_1_0 │ 2021-02-08 │
│ test_partition │ 20210701_2_2_0 │ 2021-07-01 │
│ test_partition │ 20210708_3_3_0 │ 2021-07-08 │
└────────────────┴────────────────┴────────────┘

删除分区

-- 创建使用城市的分区表
create table test_partition_city(
id Int32, 
name String, 
createTime DateTime,
city String
)engine = MergeTree() 
partition by (city)
order by id;

INSERT into test_partition_city values (1,'aa','2021-02-08 12:12:12','河南'),(2,'bb','2021-02-08 02:22:22','北京'),(3,'cc','2021-07-01 11:11:11','河南'),(4,'aa','2021-07-08 12:12:12','山东'),(5,'bb','2021-07-08 23:22:22','郑州');

SELECT * from test_partition_city ;

-- 删除分区数据，比如删除河南的分区数据
ALTER table test_partition_city drop partition '河南';
SELECT  * FROM  test_partition_city ;

合并分区

# 如上诉表 test_partition_city表我们在次新增一条数据
insert into test_partition_city values(1,'小米',now(),'郑州');
# 执行插入语句之后我们查看客户端查询语句查看结果会发现新增的数据是一种追加的形式，并没有进行分区合并。由于是合并树的引擎，所以数据在新增的时候都是按块追加的形式。

SELECT *
FROM test_partition_city

┌─id─┬─name─┬──────────createTime─┬─city─┐
│  5 │ bb   │ 2021-07-08 23:22:22 │ 郑州 │
└────┴──────┴─────────────────────┴──────┘
┌─id─┬─name─┬──────────createTime─┬─city─┐
│  2 │ bb   │ 2021-02-08 02:22:22 │ 北京 │
└────┴──────┴─────────────────────┴──────┘
┌─id─┬─name─┬──────────createTime─┬─city─┐
│  4 │ aa   │ 2021-07-08 12:12:12 │ 山东 │
└────┴──────┴─────────────────────┴──────┘
┌─id─┬─name─┬──────────createTime─┬─city─┐
│  1 │ 小米 │ 2021-07-27 15:37:09 │ 郑州 │
└────┴──────┴─────────────────────┴──────┘
# 再次执行
optimize table test_partition_city;  -- 分区合并
-- 结果如下：
SELECT *
FROM test_partition_city

┌─id─┬─name─┬──────────createTime─┬─city─┐
│  2 │ bb   │ 2021-02-08 02:22:22 │ 北京 │
└────┴──────┴─────────────────────┴──────┘
┌─id─┬─name─┬──────────createTime─┬─city─┐
│  4 │ aa   │ 2021-07-08 12:12:12 │ 山东 │
└────┴──────┴─────────────────────┴──────┘
┌─id─┬─name─┬──────────createTime─┬─city─┐
│  1 │ 小米 │ 2021-07-27 15:37:09 │ 郑州 │
│  5 │ bb   │ 2021-07-08 23:22:22 │ 郑州 │
└────┴──────┴─────────────────────┴──────┘

备份分区数据/复制分区数据

此前提条件备份表的分区键和原表一致，且表结构一致，一般使用在快速备份数据

-- 将test_partition_city 分区表中city='郑州'分区的数据复制到备份表中test_partion_city_bak
-- 新建备份表
CREATE table test_partion_city_bak as test_partition_city;

show CREATE table test_partion_city_bak;

-- 备份数据
ALTER table test_partion_city_bak replace partition '郑州' from  test_partition_city;

select * from test_partion_city_bak;

装载分区/卸载分区

卸载分实际并没有删除分区，而是将分区数据脱离cliclhouse管理，并移入当前数据目录下面的detached目录里面，你可以将其完全删除或者重新加载

# 卸载分区语句
bb8399987902 :) alter table test_partition_city detach partition '北京';
# 装载分区语句
bb8399987902 :) alter table test_partition_city attach partition '北京';