ClickHouse 的 SQL 优化规则是基于 RBO(Rule Based Optimization)
一、count 优化
--1. count()、count(1) 和 count(*),且没有 where 条件,则会直接使用 system.tables 的 total_rows
EXPLAIN SELECT count()FROM datasets.hits_v1;
--2. count(column),没有 Optimized trivial count 优化
EXPLAIN SELECT count(CounterID) FROM datasets.hits_v1;
二、消除子查询重复字段
--编写的 sql 中查询了重复字段
EXPLAIN SYNTAX SELECT a.UserID, b.VisitID, a.URL, b.UserID FROM hits_v1 AS a
LEFT JOIN (
SELECT
UserID,
UserID as uid,
VisitID
FROM visits_v1
) AS b USING (UserID)
limit 3;
--在 hive 中会直接查询展示两个重复字段的值
--但在 clickhouse 中会优化去除重复字段
SELECT
UserID,
VisitID,
URL,
b.UserID
FROM hits_v1 AS a
ALL LEFT JOIN
(
SELECT
UserID,
VisitID
FROM visits_v1
) AS b USING (UserID)
LIMIT 3;
三、谓词下推
通俗的说是指提前过滤
--1. 当 group by 有 having 子句,但是没有 with cube、with rollup 或者 with totals 修饰的时,having 过滤会下推到 where 提前过滤
EXPLAIN SYNTAX SELECT UserID FROM hits_v1 GROUP BY UserID HAVING UserID = '8585742290196126178';
--返回优化语句
SELECT UserID FROM hits_v1 WHERE UserID = '8585742290196126178' GROUP BY UserID
--2. 子查询的谓词下推 (ps:hive 中子查询不会谓词下推)
EXPLAIN SYNTAX
SELECT *
FROM
(
SELECT
UserID
FROM visits_v1
)
WHERE UserID = '8585742290196126178'
--返回优化后的语句
SELECT UserID
FROM
(
SELECT
UserID
FROM visits_v1
WHERE UserID = '8585742290196126178'
)
WHERE UserID = '8585742290196126178'
EXPLAIN SYNTAX
SELECT * FROM
(
SELECT
*
FROM
(
SELECT
UserID
FROM visits_v1
)
UNION ALL
SELECT
*
FROM
(
SELECT
UserID
FROM visits_v1
)
)
WHERE UserID = '8585742290196126178'
--返回优化后的语句
SELECT UserID
FROM
(
SELECT UserID
FROM
(
SELECT UserID
FROM visits_v1
WHERE UserID = '8585742290196126178'
)
WHERE UserID = '8585742290196126178'
UNION ALL
SELECT UserID
FROM
(
SELECT UserID
FROM visits_v1
WHERE UserID = '8585742290196126178'
)
WHERE UserID = '8585742290196126178'
)
WHERE UserID = '8585742290196126178'
四、聚合计算外推
--聚合函数内的计算,会外推 (ps:hive中不会外推)
EXPLAIN SYNTAX SELECT sum(UserID * 2) FROM visits_v1;
--返回优化后的语句
SELECT sum(UserID) * 2 FROM visits_v1
五、聚合函数消除
--对于无意义的聚合函数会优化消除
EXPLAIN SYNTAX
SELECT
sum(UserID * 2),
max(VisitID),
max(UserID)
FROM visits_v1
GROUP BY UserID
--返回优化后的语句
SELECT
sum(UserID) * 2,
max(VisitID),
UserID
FROM visits_v1
GROUP BY UserID
六、不同语法删除重复的 key
--order by
EXPLAIN SYNTAX
SELECT *
FROM visits_v1
ORDER BY
UserID ASC,
UserID ASC,
VisitID ASC,
VisitID ASC
--返回优化后的语句:
select
*
FROM visits_v1
ORDER BY
UserID ASC,
VisitID ASC
--limit by
EXPLAIN SYNTAX
SELECT *
FROM visits_v1
LIMIT 3 BY
VisitID,
VisitID
LIMIT 10
--返回优化后的语句:
select
*
FROM visits_v1
LIMIT 3 BY VisitID
LIMIT 10
--using
EXPLAIN SYNTAX
SELECT
a.UserID,
a.UserID,
b.VisitID,
a.URL,
b.UserID
FROM hits_v1 AS a
LEFT JOIN visits_v1 AS b USING (UserID, UserID)
--返回优化后的语句:
SELECT
UserID,
UserID,
VisitID,
URL,
b.UserID
FROM hits_v1 AS a
ALL LEFT JOIN visits_v1 AS b USING (UserID)
七、标量替换
--如果子查询只返回一行数据,在被引用的时候用标量替换
EXPLAIN SYNTAX
WITH
(
SELECT sum(bytes)
FROM system.parts
WHERE active
) AS total_disk_usage
SELECT
(sum(bytes) / total_disk_usage) * 100 AS table_disk_usage,
table
FROM system.parts
GROUP BY table
ORDER BY table_disk_usage DESC
LIMIT 10;
--返回优化后的语句:
WITH CAST(0, 'UInt64') AS total_disk_usage
SELECT
(sum(bytes) / total_disk_usage) * 100 AS table_disk_usage,
table
FROM system.parts
GROUP BY table
ORDER BY table_disk_usage DESC
LIMIT 10
八、三元运算符优化
--开启 optimize_if_chain_to_multiif 参数,会进行三元运算符优化
EXPLAIN SYNTAX
SELECT number = 1 ? 'hello' : (number = 2 ? 'world' : 'atguigu')
FROM numbers(10)
settings optimize_if_chain_to_multiif = 1;
--返回优化后的语句:
SELECT multiIf(number = 1, 'hello', number = 2, 'world', 'atguigu')
FROM numbers(10) SETTINGS optimize_if_chain_to_multiif = 1