clickhouse物化视图最佳实践样例

yangls的技术博客

已于 2023-06-14 17:28:15 修改

阅读量976

点赞数 3

文章标签： sql 数据库 sqlserver

于 2022-05-08 12:29:58 首次发布

本文链接：https://blog.csdn.net/weixin_44597427/article/details/124646009

版权

案例三 Materialized views for last point queries

案例四 using views to remember unique visitors

clickhouse的视图是一段sql逻辑很好理解，而物化视图不太一样，简单可以理解为将sql的结果存下来，这样可以加速查询，但事实当然不只是这样，最主要的是原表的变化会触发视图表的数据增加，clickhouse物化视图除了常用的sql语句，还有其独特的语法，在特定时候有奇效。

需要注意的是，物化视图不支持unoin ，在多表关联时左表新增数据会触发视图更新而右边不会，所以在使用视图的时候有许多需要注意的地方，不然会丢失数据。

CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name 
[ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS 
SELECT ...

上面只是简略的写法，其实clickhouse物化视图创建分为两类：后面也会讲到：

第一种创建方式

When creating a materialized view without TO [db].[table], you must specify ENGINE – the table engine for storing data.

CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]view_name 
[ON CLUSTER] 
ENGINE = engine [POPULATE] AS 
SELECT ...

使不使用populate的区别在于使用populate会在创建试图时执行select后面的sql将数据插入，也就是会有初始化数据，而不适用populate关键字就没有初始化数据，只会有源表数据更新才会有数据插入，官方不推荐使用populate，因为创建视图过程中源表插入的数据并不会在视图之中。

第二种创建方式

When creating a materialized view with TO [db].[table], you must not use POPULATE.

--一般需要先建一张表

CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]view_name 
[ON CLUSTER] TO[db.]name] 
[ENGINE = engine]  AS 
SELECT ...

案例一

CREATE TABLE chtest.tripdata
(
  id Int32,
  pickup_location_id String,
  dropoff_location_id String,
  passenger_count  Int32,
  fare_amount Int32,
  pickup_date Date
)
ENGINE = MergeTree
partition by pickup_date
ORDER BY id
SETTINGS index_granularity = 8192;

select pickup_location_id,
  sum(passenger_count)/pc_cnt AS pc_avg,
  count() AS pc_cnt
from tripdata 
group by 
  pickup_location_id
order by 
  pc_cnt desc 
limit 10;

let's make it faster!

create MATERIALIZED view tripdata_smt_mv
engine=SummingMergeTree
partition by toYYYYMM(pickup_date)
order by (pickup_location_id,dropoff_location_id)
populate AS
select
  pickup_date,
  pickup_location_id,
  dropoff_location_id,
  sum(passenger_count) AS passenger_count_sum,
  count() AS trips_count
from tripdata
group by 
  pickup_date,
  pickup_location_id,
  dropoff_location_id;

select pickup_location_id,
  sum(passenger_count_sum)/sum(trips_count) AS pc_avg,
  sum(trips_count) AS pc_cnt
from tripdata_smt_mv 
group by pickup_location_id
order by pc_cnt desc 
limit 10;

视图在这里相当于中间结果，提前做了聚合，只需要再次聚合即可得到结果。

案例二

select min(fare_amount),
  avg(fare_amount),
  max(fare_amount),
  sum(fare_amount),
  count()
from tripdata
where fare_amount>0 and fare_amount<500;


create MATERIALIZED view tripdata_agg_mv
engine=SummingMergeTree
partition by toYYYYMM(pickup_date)
order by (pickup_location_id,dropoff_location_id)
populate AS
select
  pickup_date,
  pickup_location_id,
  dropoff_location_id,
  minState(fare_amount) AS fare_amount_min,
  avgState(fare_amount) AS fare_amount_avg,
  maxState(fare_amount) AS fare_amount_max,
  sumState(fare_amount) AS fare_amount_sum,
  countState() AS fare_amount_count
from tripdata
where fare_amount>0 and fare_amount <500
group by 
  pickup_date,
  pickup_location_id,
  dropoff_location_id;

select 
  minMerge(fare_amount_min) AS fare_amount_min,
  maxMerge(fare_amount_max) AS fare_amount_max,
  avgMerge(fare_amount_avg) AS fare_amount_avg,
  sumMerge(fare_amount_sum) AS fare_amount_sum,
  countMerge(fare_amount_count) AS fare_amount_count
from tripdata_agg_mv;

案例三 Materialized views for last point queries

--cpu  源表：
create table cpu (
  tags_id UInt32,
  usage_idle Float64,
  created_at DateTime,
  time String,
  created_date Date
)
engine=MergeTree
partition by created_date
order by tags_id;


select 
  t.hostname,
  tags_id,
  100-usage_idle AS usage
from(
  select 
    tags_id,
    usage_idle
  from
    cpu
  where (tags_id,created_at) in
    (select tags_id,max(created_at) from cpu group by tag_id)
)AS c
inner join 
  tags AS t
ON c.tags_id=t.id
order by 
  usage desc,
  t.hostname asc
limit 10;

--new way to create materialized views:
create table cpu_last_point_idle_agg(
  created_date AggregateFunction(argMax,Date,DateTime),
  max_created_at AggregateFunction(max,DateTime),
  time AggregateFunction(argMax,String,DateTime),
  tags_id UInt32,
  usage_idle AggregateFunction(argMax,Float64,DateTime)
)
ENGINE=AggregatingMergeTree()
PARTITION BY tuple()
ORDER BY tags_id;
-- 'tuple()' means don't partition and don't sort data

CREATE MATERIALIZED VIEW cpu_last_point_idle_mv
to cpu_last_point_idle_agg
AS SELECT
  argMaxState(created_date,created_at) AS created_date,
  maxState(created_at) AS max_created_at,
  argMaxState(time,created_at) AS time,
  tags_id,
  argMaxState(usage_idle,created_at) AS usage_idle
from cpu
group by tags_id;

INSERT INTO cpu_last_point_idle_mv
SELECT
  argMaxState(created_date,created_at) AS created_date,
  maxState(created_at) AS max_created_at,
  argMaxState(time,created_at) AS time,
  tags_id,
  argMaxState(usage_idle,created_at) AS usage_idle
from cpu
group by tags_id;
-- populate keyword is not supported


CREATE VIEW cpu_last_point_idle_v AS
select
  argMaxMerge(created_date) AS created_date,
  maxMerge(max_created_at) AS created_at,
  argMaxMerge(time) AS time,
  tags_id,
  argMaxMerge(usage_idle) AS usage_idle
from 
  cpu_last_point_idle_mv
group by tags_id;

-- 查询视图
SELECT t.hostname,tags_id,100-usage_idle usage
from cpu_last_point_idle_v AS b
inner join  tags AS t on b.tags_id=t.id
order by 
  usage DESC , 
  t.hostname ASC
limit 10;
-- 速度快多了！！

案例四 using views to remember unique visitors

create table traffic(
    datetime DateTime,
    date Date,
    request_id UInt64,
    cust_id UInt32,
    sku UInt32
  )ENGINE=MergeTree
  PARTITION BY (cust_id ,date)
  order by cust_id
  TTL datetime + INTERVAL 90 DAY;
  
  insert into traffic values('2022-04-04 11:11:11','2022-04-04',23451231223,789564,100);
  insert into traffic values('2022-04-04 11:15:11','2022-04-04',23451231223,323322,100);
  insert into traffic values('2022-04-04 12:11:11','2022-04-04',23451231223,789564,100);
  insert into traffic values('2022-04-04 12:13:11','2022-04-04',23451231223,789564,100);

-- find which hours have most unique visits on sku #100
select 
  toStartOfHour(datetime) AS hour,
  uniq(cust_id) AS uniqs
from traffic
where sku=100
group by hour -- 这里可以发现clickhouse 的group by语句和hive是不太一样的
order by uniqs desc ,hour asc
limit 10;

CREATE TABLE traffic_uniqs_agg(
  hour DateTime,
  cust_id_uniqs AggregateFunction(uniq,UInt32),
  sku UInt32
)
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(hour)
order by (sku,hour);
--

create materialized view traffic_uniqs_mv
to traffic_uniqs_agg as
select 
  toStartOfHour(datetime) as hour,
  uniqState(cust_id) as cust_id_uniqs,
  sku
from traffic
group by hour,sku;

select
  hour,
  uniqMerge(cust_id_uniqs) as uniqs
from traffic_uniqs_mv
where sku=100
group by hour
order by uniqs desc ,hour asc
limit 10;

以上就是clickhouse物化视图几种使用场景，常常是用于单表

yangls的技术博客

关注

3
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫