面试常规考点
分组取几函数的topicK
基础准备:
CREATE TABLE nx_topK_test ( a Int32,b Int32,c Int32) ENGINE = Memory;
insert into nx_topK_test ( a,b,c) values (1,2,5),(1,2,4),(1,3,8),(1,3,2),(1,4,6),(2,3,3),(2,3,7),(2,3,8),(2,4,9),(2,5,6),(3,3,4),(3,3,7),(3,3,5),(3,4,9),(3,5,6);
SELECT * FROM nx_topK_test ORDER BY a ASC;
select a,b,c from nx_topK_test order by a asc,c desc;
思路:
将数据排序--再去前几个
-- 默认,topk 只取,这一组的前k个元素SELECT a, topK(2)(c) FROM nx_topK_test group by a;
-- 所以,需要先按照要求,给数据进行排序,然后进行 topk 操作,就可以取到每组的前几个元素
SELECT a, topK(3)(c) FROM (select a,c from nx_topK_test order by a asc, c desc) group by a order by a;
窗口分析函数
Window Functions 在 clickhouse 的需求和呼声很高,早期的版本需要借助 array 函数,在 21.1 版本进行了开窗函数的初步支持。
select version();
SET allow_experimental_window_functions = 1;
create table nx_window_data_test(id String, score UInt8) engine=MergeTree() order by id;
insert into nx_window_data_test(id,score) values ('A', 90),('A', 80),('A', 88),('A', 86),('B', 91),('B', 95),('B', 90),('C', 88),('C', 89),('C', 90);
select * from nx_window_data_test;
select * from nx_window_data_test order by id, score desc;
计算分组累加
select id, score, sum(score) over ( partition by id order by score ) sum from nx_window_data_test;
select id, score, max(score) over (partition by id order by score) max from nx_window_data_test;
select id, score, min(score) over (partition by id order by score) min from nx_window_data_test;
select id, score, avg(score) over (partition by id order by score) avg from nx_window_data_test;
select id, score, count(score) over (partition by id order by score) count from nx_window_data_test;
eg:
每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数
create table exercise_pv(id String, month String, pv UInt32) engine=MergeTree() order by id;
insert into exercise_pv(id, month, pv) values ('A', '2015-01', 33),('A', '2015-02', 10),('A', '2015-03', 38),('A', '2015-04', 20),('B', '2015-01', 30),('B', '2015-02', 15),('B', '2015-03', 44),('B', '2015-04', 35);
select * from exercise_pv;
select id,month, pv,sum(pv) over(partition by id order by month) sum,max(pv) over ( partition by id order by month) max from exercise_pv;
同比环比
同比: 跟去年同期比:同比增长率 =(本期数 - 同期数) / 同期数 = (202105 - 202005)/ 202005
环比: 跟上个月比:环比增长率 =(本期数 - 上期数) / 上期数 = (202105 - 202004)/ 202004
核心思想:
with toDate('2020-01-01') as start_date select toStartOfMonth(start_date + (number*31)) month_start, (number+20)*100 amount from numbers(24);
WITH toDate('2020-01-01') AS start_date SELECT
toStartOfMonth(start_date + (number * 31)) AS month_start,
(number + 20) * 100 AS amount,
neighbor(amount, -12) AS prev_year_amount,
neighbor(amount, -1) AS prev_month_amount
FROM numbers(24);
eg:
WITH toDate('2020-01-01') AS start_date
SELECT
toStartOfMonth(start_date + (number * 31)) AS month_start,
(number + 20) * 100 AS amount,
neighbor(amount,-12) AS prev_year_amount,
neighbor(amount,-1) AS prev_month_amount,
if(prev_year_amount = 0,-999,amount - prev_year_amount) as year_inc,
if(prev_year_amount = 0,-999,round((amount - prev_year_amount) / prev_year_amount, 4)) AS year_over_year,
if(prev_year_amount = 0,-999,amount - prev_month_amount) as month_inc,
if(prev_month_amount = 0,-999,round((amount - prev_month_amount) / prev_month_amount, 4))
AS month_over_month
FROM numbers(24);
漏斗分析: 转化? 什么叫转化 ~观察用户在那个环节丢失的
CREATE TABLE nx_window_funnel_test (uid String, eventid String, eventTime UInt64) ENGINE = Memory;
insert into nx_window_funnel_test (uid,eventid,eventTime) values ('A','login',20200101),('A','view',20200102),('A','buy',20200103),('B','login',20200101),('B','view',20200102),('C','login',20200101),('C','buy',20200102),('D','login',20200101),('D','view',20200103),('D','buy',20200102);
select * from nx_window_funnel_test order by uid;
SELECT uid, windowFunnel (2) (eventTime, eventid = 'login', eventid = 'view', eventid = 'buy') AS res
FROM nx_window_funnel_test GROUP BY uidorder by uid;
如何利用clickhouse 实现去重
1、非精确去重函数:uniq、uniqHLL12、uniqCombined、uniqCombined64
2、精确去重函数:uniqExact、groupBitmap
tips:
1、整形值精确去重场景,groupBitmap 比 uniqExact 快很多
2、groupBitmap 仅支持整形值去重, uniqExact 支持任意类型(Tuple、Array、Date、DateTime、String和数字类型)去重。
3、非精确去重场景,uniq 在精准度上有优势。
4、uniq是近似去重,千万级用户,精确度能达到99%以上,uniqExact是精确去重,和mysql的count distinct功能相同,比如统计uv。
在mysql中去重的操作.直接引擎到ck中.
在 MySQL 数据库中,有一个 nx_job 里面有一张表: job : 55875 条记录!
create database if not exists nx_job ENGINE = MySQL('bigdata02:3306', 'nx_job', 'root', 'QWer_1234');
create database if not exists nx_job1;
use nx_job1;
create table job (id UInt32,t_job String,t_addr String,t_tag String,t_com String,t_money String,t_edu String,t_exp String,t_type String,t_level String) ENGINE = MergeTree() order by id;
insert into job(id, t_job, t_addr, t_tag, t_com, t_money ,t_edu ,t_exp ,t_type ,t_level) select id, t_job, t_addr, t_tag, t_com, t_money ,t_edu ,t_exp ,t_type ,t_level from nx_job.job;
select * from job limit 3;
select count(*) as total from job;
-- 精确去重
select count(distinct id) as total from job;
select countDistinct(id) as total from job;
-- 精确去重
select uniqExact(id) from job;
select groupBitmap(id) from job;
-- 近似去重select uniq(id) from job;
select uniqHLL12(id) from job;
应用场景
select uniqExact(uvid) from tbUv where date='2020-09-06' and url='http://lol.qq.com/main.shtml';
clickhouse 整合bitmap~~大表 join 的方案!
关于 ClickHouse 的位图函数 https://clickhouse.tech/docs/zh/sql-reference/functions/bitmap-functions/
eg1:电信用户每个月的话费统计。
eg2:统计微信过去连续7天都发朋友圈的用户集。
应用场景