目录
案例三 Materialized views for last point queries
案例四 using views to remember unique visitors
clickhouse的视图是一段sql逻辑很好理解,而物化视图不太一样,简单可以理解为将sql的结果存下来,这样可以加速查询,但事实当然不只是这样,最主要的是原表的变化会触发视图表的数据增加,clickhouse物化视图除了常用的sql语句,还有其独特的语法,在特定时候有奇效。
需要注意的是,物化视图不支持unoin ,在多表关联时左表新增数据会触发视图更新而右边不会,所以在使用视图的时候有许多需要注意的地方,不然会丢失数据。
CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
上面只是简略的写法,其实clickhouse物化视图创建分为两类:后面也会讲到:
第一种创建方式
When creating a materialized view without TO [db].[table], you must specify ENGINE – the table engine for storing data.
CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]view_name
[ON CLUSTER]
ENGINE = engine [POPULATE] AS
SELECT ...
使不使用populate的区别在于使用populate会在创建试图时执行select后面的sql将数据插入,也就是会有初始化数据,而不适用populate关键字就没有初始化数据,只会有源表数据更新才会有数据插入,官方不推荐使用populate,因为创建视图过程中源表插入的数据并不会在视图之中。
第二种创建方式
When creating a materialized view with TO [db].[table], you must not use POPULATE.
--一般需要先建一张表
CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]view_name
[ON CLUSTER] TO[db.]name]
[ENGINE = engine] AS
SELECT ...
案例一
CREATE TABLE chtest.tripdata
(
id Int32,
pickup_location_id String,
dropoff_location_id String,
passenger_count Int32,
fare_amount Int32,
pickup_date Date
)
ENGINE = MergeTree
partition by pickup_date
ORDER BY id
SETTINGS index_granularity = 8192;
select pickup_location_id,
sum(passenger_count)/pc_cnt AS pc_avg,
count() AS pc_cnt
from tripdata
group by
pickup_location_id
order by
pc_cnt desc
limit 10;
let's make it faster!
create MATERIALIZED view tripdata_smt_mv
engine=SummingMergeTree
partition by toYYYYMM(pickup_date)
order by (pickup_location_id,dropoff_location_id)
populate AS
select
pickup_date,
pickup_location_id,
dropoff_location_id,
sum(passenger_count) AS passenger_count_sum,
count() AS trips_count
from tripdata
group by
pickup_date,
pickup_location_id,
dropoff_location_id;
select pickup_location_id,
sum(passenger_count_sum)/sum(trips_count) AS pc_avg,
sum(trips_count) AS pc_cnt
from tripdata_smt_mv
group by pickup_location_id
order by pc_cnt desc
limit 10;
视图在这里相当于中间结果,提前做了聚合,只需要再次聚合即可得到结果。
案例二
select min(fare_amount),
avg(fare_amount),
max(fare_amount),
sum(fare_amount),
count()
from tripdata
where fare_amount>0 and fare_amount<500;
create MATERIALIZED view tripdata_agg_mv
engine=SummingMergeTree
partition by toYYYYMM(pickup_date)
order by (pickup_location_id,dropoff_location_id)
populate AS
select
pickup_date,
pickup_location_id,
dropoff_location_id,
minState(fare_amount) AS fare_amount_min,
avgState(fare_amount) AS fare_amount_avg,
maxState(fare_amount) AS fare_amount_max,
sumState(fare_amount) AS fare_amount_sum,
countState() AS fare_amount_count
from tripdata
where fare_amount>0 and fare_amount <500
group by
pickup_date,
pickup_location_id,
dropoff_location_id;
select
minMerge(fare_amount_min) AS fare_amount_min,
maxMerge(fare_amount_max) AS fare_amount_max,
avgMerge(fare_amount_avg) AS fare_amount_avg,
sumMerge(fare_amount_sum) AS fare_amount_sum,
countMerge(fare_amount_count) AS fare_amount_count
from tripdata_agg_mv;
案例三 Materialized views for last point queries
--cpu 源表:
create table cpu (
tags_id UInt32,
usage_idle Float64,
created_at DateTime,
time String,
created_date Date
)
engine=MergeTree
partition by created_date
order by tags_id;
select
t.hostname,
tags_id,
100-usage_idle AS usage
from(
select
tags_id,
usage_idle
from
cpu
where (tags_id,created_at) in
(select tags_id,max(created_at) from cpu group by tag_id)
)AS c
inner join
tags AS t
ON c.tags_id=t.id
order by
usage desc,
t.hostname asc
limit 10;
--new way to create materialized views:
create table cpu_last_point_idle_agg(
created_date AggregateFunction(argMax,Date,DateTime),
max_created_at AggregateFunction(max,DateTime),
time AggregateFunction(argMax,String,DateTime),
tags_id UInt32,
usage_idle AggregateFunction(argMax,Float64,DateTime)
)
ENGINE=AggregatingMergeTree()
PARTITION BY tuple()
ORDER BY tags_id;
-- 'tuple()' means don't partition and don't sort data
CREATE MATERIALIZED VIEW cpu_last_point_idle_mv
to cpu_last_point_idle_agg
AS SELECT
argMaxState(created_date,created_at) AS created_date,
maxState(created_at) AS max_created_at,
argMaxState(time,created_at) AS time,
tags_id,
argMaxState(usage_idle,created_at) AS usage_idle
from cpu
group by tags_id;
INSERT INTO cpu_last_point_idle_mv
SELECT
argMaxState(created_date,created_at) AS created_date,
maxState(created_at) AS max_created_at,
argMaxState(time,created_at) AS time,
tags_id,
argMaxState(usage_idle,created_at) AS usage_idle
from cpu
group by tags_id;
-- populate keyword is not supported
CREATE VIEW cpu_last_point_idle_v AS
select
argMaxMerge(created_date) AS created_date,
maxMerge(max_created_at) AS created_at,
argMaxMerge(time) AS time,
tags_id,
argMaxMerge(usage_idle) AS usage_idle
from
cpu_last_point_idle_mv
group by tags_id;
-- 查询视图
SELECT t.hostname,tags_id,100-usage_idle usage
from cpu_last_point_idle_v AS b
inner join tags AS t on b.tags_id=t.id
order by
usage DESC ,
t.hostname ASC
limit 10;
-- 速度快多了!!
案例四 using views to remember unique visitors
create table traffic(
datetime DateTime,
date Date,
request_id UInt64,
cust_id UInt32,
sku UInt32
)ENGINE=MergeTree
PARTITION BY (cust_id ,date)
order by cust_id
TTL datetime + INTERVAL 90 DAY;
insert into traffic values('2022-04-04 11:11:11','2022-04-04',23451231223,789564,100);
insert into traffic values('2022-04-04 11:15:11','2022-04-04',23451231223,323322,100);
insert into traffic values('2022-04-04 12:11:11','2022-04-04',23451231223,789564,100);
insert into traffic values('2022-04-04 12:13:11','2022-04-04',23451231223,789564,100);
-- find which hours have most unique visits on sku #100
select
toStartOfHour(datetime) AS hour,
uniq(cust_id) AS uniqs
from traffic
where sku=100
group by hour -- 这里可以发现clickhouse 的group by语句和hive是不太一样的
order by uniqs desc ,hour asc
limit 10;
CREATE TABLE traffic_uniqs_agg(
hour DateTime,
cust_id_uniqs AggregateFunction(uniq,UInt32),
sku UInt32
)
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(hour)
order by (sku,hour);
--
create materialized view traffic_uniqs_mv
to traffic_uniqs_agg as
select
toStartOfHour(datetime) as hour,
uniqState(cust_id) as cust_id_uniqs,
sku
from traffic
group by hour,sku;
select
hour,
uniqMerge(cust_id_uniqs) as uniqs
from traffic_uniqs_mv
where sku=100
group by hour
order by uniqs desc ,hour asc
limit 10;
以上就是clickhouse物化视图几种使用场景,常常是用于单表