1. 建表
Clickhouse
-- clickhouse建表
-- 01 本地表样例
CREATE TABLE label_db.workload_local_v1 on cluster cowa_cluster
(
`timestamp` DateTime COMMENT '时间戳',
`action` String COMMENT '操作类型',
`user_id` UInt32 COMMENT '用户人id',
`uid` UInt32 COMMENT '操作人id',
`task_id` int COMMENT '任务id',
`project_id` int COMMENT '项目id',
`work_time` int COMMENT '工作时间',
`object_size` int COMMENT '新增对象数',
`confirmed_prelabel_size` int COMMENT '已确认或修改预标注对象数',
`prelabel_object_size` int COMMENT '预标注对象总数',
`review_size` int COMMENT '审核对象数',
`comment_size` int COMMENT '批注对象数',
`reviewed_size` int COMMENT '被审核对象数',
`commented_size` int COMMENT '被批注对象数',
`commit_label_1` int COMMENT '提交标注1',
`commit_review_1` int COMMENT '提交审核1',
`commit_review_2` int COMMENT '提交审核2',
`reject_review_1` int COMMENT '驳回审核1',
`reject_review_2` int COMMENT '驳回审核2',
`commit_reviewed_1` int COMMENT '被审核1审核',
`commit_reviewed_2` int COMMENT '被审核2审核',
`rejected_label_1` int COMMENT '标注1被驳回',
`rejected_review_1` int COMMENT '审核1倍驳回',
`dynamic_size` int COMMENT '新增动态对象数',
`static_size` int COMMENT '新增静态对象数',
`review_dynamic_size` int COMMENT '审核动态对象数',
`review_static_size` int COMMENT '审核静态对象数',
`is_rejected` int COMMENT '是否被驳回',
`_sign` UInt8,
`_ver` UInt32
) ENGINE = ReplicatedReplacingMergeTree(
'/clickhouse/tables/{shard}/workload_local_v1',
'{replica}',
_ver
) PARTITION BY toYYYYMM(timestamp)
ORDER BY (
uid,
project_id,
task_id,
timestamp
);
-- 02 分布式表样例
create table label_db.workload_v1 on cluster cowa_cluster
as label_db.workload_local_v1 ENGINE = Distributed('cowa_cluster', 'label_db', label_db.workload_local_v1,
cityHash64(user_id));
ByConity
-- 注意没有本地表分布式表的概念,直接创建
CREATE TABLE label_db.workload_v1
(
`timestamp` DateTime COMMENT '时间戳',
`action` String COMMENT '操作类型',
`user_id` UInt32 COMMENT '用户人id',
`uid` UInt32 COMMENT '操作人id',
`task_id` int COMMENT '任务id',
`project_id` int COMMENT '项目id',
`work_time` int COMMENT '工作时间',
`object_size` int COMMENT '新增对象数',
`confirmed_prelabel_size` int COMMENT '已确认或修改预标注对象数',
`prelabel_object_size` int COMMENT '预标注对象总数',
`review_size` int COMMENT '审核对象数',
`comment_size` int COMMENT '批注对象数',
`reviewed_size` int COMMENT '被审核对象数',
`commented_size` int COMMENT '被批注对象数',
`commit_label_1` int COMMENT '提交标注1',
`commit_review_1` int COMMENT '提交审核1',
`commit_review_2` int COMMENT '提交审核2',
`reject_review_1` int COMMENT '驳回审核1',
`reject_review_2` int COMMENT '驳回审核2',
`commit_reviewed_1` int COMMENT '被审核1审核',
`commit_reviewed_2` int COMMENT '被审核2审核',
`rejected_label_1` int COMMENT '标注1被驳回',
`rejected_review_1` int COMMENT '审核1倍驳回',
`dynamic_size` int COMMENT '新增动态对象数',
`static_size` int COMMENT '新增静态对象数',
`review_dynamic_size` int COMMENT '审核动态对象数',
`review_static_size` int COMMENT '审核静态对象数',
`is_rejected` int COMMENT '是否被驳回',
`_sign` UInt8
) ENGINE = CnchMergeTree(_ver)
PARTITION BY toDate(timestamp)
UNIQUE KEY (
uid,
project_id,
task_id,
timestamp
);
小结
- ByConity不需要在每个节点建本地表和分布式表,更加简洁和易维护
- ByConity的UNIQUE KEY(实现去重功能)相当于Clickhouse的ORDER BY ,语义更加合理。
2. 典型表引擎
Byconity的CnchMergeTree
(唯一键引擎)对标Clickhouse最常用的ReplacingMergeTree
Byconity的数据按分区键Partition by
进行分区,然后排序键Order by
进行有序存储
Byconity同样通过Version
的方式保留最新版本,CnchMergeTree后面直接指定version字段
2.1 Byconity版本管理
- 不使用版本管理
-- 01
CREATE TABLE UniqTest_v1
(
`key` Int64,
`val` String,
`eventTime` DateTime
)
ENGINE = CnchMergeTree()-- 未加入版本管理
ORDER BY `key`
PRIMARY KEY `key` UNIQUE KEY `key`;
insert into UniqTest_v1 VALUES (1, 'first', '2020-01-01 01:01:01');
insert into UniqTest_v1 VALUES (1, 'second', '2020-01-01 00:00:00');
结论:可以实现Upsert,但是不能保证时序
- 使用版本管理
CREATE TABLE UniqTest_v2
(
`key` Int64,
`val` String,
`eventTime` DateTime
)
ENGINE = CnchMergeTree(eventTime)
ORDER BY `key`
PRIMARY KEY `key` UNIQUE KEY `key`;
insert into UniqTest_v2 VALUES (1, 'first', '2020-01-01 01:01:01');
insert into UniqTest_v2 VALUES (1, 'second', '2020-01-01 00:00:00');
结论:可以实现Upsert,且保证了时序
3. Byconity高级特性
3.1 投影
注意投影不能和UNIQUE KEY
同时使用,下面是错误示例:
CREATE TABLE bigdata.iot_drive_base
(
`create_time` DateTime,
`local_time` DateTime64(3, 'Asia/Shanghai'),
`time` String,
`carNo` String,
`drivingMode` String,
`latitude` String,
`longitude` String,
`mode` String,
`sequence` int,
`speed` Float64,
PROJECTION prj_car (
SELECT *
ORDER BY
carNo,
`time`
)
)
ENGINE = CnchMergeTree()
PARTITION BY toYYYYMMDD(create_time)
PRIMARY KEY `time`
UNIQUE KEY (`time`, carNo)
TTL create_time + toIntervalDay(7);
-- DB::Exception: `Projection` cannot be used together with `UNIQUE KEY` SQLSTATE: 22000 (version 21.8.7.1)
- 因为投影里面的Order By已经实现了UNIQUE KEY的功能,下面是正确示例:
CREATE TABLE bigdata.iot_drive_base
(
`create_time` DateTime,
`local_time` DateTime64(3, 'Asia/Shanghai'),
`time` String,
`carNo` String,
`drivingMode` String,
`latitude` String,
`longitude` String,
`mode` String,
`sequence` int,
`speed` Float64,
PROJECTION prj_car (
SELECT *
ORDER BY
carNo,
`time`
)
)
ENGINE = CnchMergeTree()
PARTITION BY toYYYYMMDD(create_time)
PRIMARY KEY `time`
TTL create_time + toIntervalDay(7);
3.2 物化视图
注意目前Byconity官方显示还支持CnchMergeTree()
引擎使用物化视图,但是聚合视图可以使用,示例如下:
CREATE MATERIALIZED VIEW bigdata.mv_iot_agg
(
`create_date` Date,
`carNo` String,
`latitude_nan_cnt` AggregateFunction(sum, UInt64),
`latitude_null_cnt` AggregateFunction(sum, UInt64)
)
ENGINE = CnchAggregatingMergeTree
PARTITION BY toDate(create_date)
ORDER BY (create_date, carNo)
AS
SELECT toDate(create_time) AS create_date,
carNo,
sumState(toUInt64(multiIf(latitude = 'NaN', 1, 0))) AS latitude_nan_cnt,
sumState(toUInt64(multiIf(latitude = '', 1, 0))) AS latitude_null_cnt
FROM bigdata.iot_drive_base
GROUP BY create_time,
carNo;
set optimize_move_to_prewhere = 0; -- 目前的小bug,需要关闭优化器才可以正常查询
4. 函数
4.1 开窗
- 区别于Clickhouse,Byconity实现了开窗,示例如下可以直接使用:
-- a1
select carNo,
sum(diff_ts) / 3600 as mile
from (
select carNo,
toUnixTimestamp(lead_local_time) - toUnixTimestamp(local_time) as diff_ts
from (
SELECT carNo,
local_time,
lead(local_time, 1) over(PARTITION BY carNo ORDER BY local_time) AS lead_local_time
FROM bigdata.iot_drive_base
where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
) l1
) l2
where diff_ts < 60
and diff_ts >= 0
group by carNo;
-- a2
select carNo,
local_time,
sequence,
latitude,
longitude,
speed
from (
SELECT carNo,
local_time,
sequence,
latitude,
longitude,
speed,
row_number() over(PARTITION BY carNo,local_time ORDER BY create_time desc) as rn
FROM bigdata.iot_drive_base
where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
) l1
where rn = 1;
4.2 其他普通函数
-- l1
SET optimize_move_to_prewhere = 1;
select l1.carNo,
round(nvl(today_mile, 0), 0) today_mile,
round(nvl(yesterday_mile, 0), 0) yesterday_mile,
round(case when today_mile > yesterday_mile then today_mile - yesterday_mile else 0 end, 2) as up_mile
from (
select carNo,
sum(speed) * 0.5 / 1000 today_mile
from bigdata.iot_drive_base
where toDate(toUnixTimestamp(create_time)) = toDate(today())
group by carNo
) l1
left join(
select carNo,
sum(speed) * 0.5 / 1000 yesterday_mile
from bigdata.iot_drive_base
where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
group by carNo
) l2
on l1.carNo = l2.carNo;
-- l2
select toDate(toUnixTimestamp(create_time)) as dt,
groupUniqArray(carNo) as car_list,
length(groupUniqArray(carNo)) as cars
from bigdata.iot_drive_base
group by toDate(toUnixTimestamp(create_time));
5. 总结
ByConity
对Clickhouse
整体进行了封装,实现了远程挂载HDFS,实现了类似Doris开窗能力的开窗函数,实现了简易建表易于维护。总体上还不错,一些Bug的解决观望后续社区的进展。