clickhouse在做用户画像标签时,怎么去做圈选,表结构应该是怎么样的,我们应该怎么去处理,能够使其高性能的圈选,尽可能缩小其占用的存储空间?
这个问题,我通过代码给大家做下的演示
先在hive中对数据预处理
--最初表结构
create table f_tag_table(
tag Int,
tag_name String,
cust_num String
);
--插入数据
INSERT INTO f_tag_table values
(1001, '存款客户', '1,2,3'),
(2001,'国债客户', '2,3,4'),
(3001,'卡黑名单客户', '2,4'),
(4001,'短信黑名单', '3,4');
现在的表是这样的:
tag | tag_name | cust_num |
1001 | 存款客户 | 1,2,3 |
2001 | 国债客户 | 2,3,4 |
3001 | 卡黑名单客户 | 2,4 |
4001 | 短信黑名单 | 3,4 |
重新建一张表
create table usr_table(
id int,
tag_name varchar(30),
tag int
);
把表f_tag_table的数据插入进去
insert into usr_table
select usr, tag_name, tag from f_tag_table
lateral view explode(split(cust_num, ',')) tmp as usr;
现在的表结构如下
usr | tag_name | tag |
1 | 存款客户 | 1001 |
2 | 存款客户 | 1001 |
3 | 存款客户 | 1001 |
2 | 国债客户 | 2001 |
3 | 国债客户 | 2001 |
4 | 国债客户 | 2001 |
2 | 卡黑名单客户 | 3001 |
4 | 卡黑名单客户 | 3001 |
3 | 短信黑名单 | 4001 |
4 | 短信黑名单 | 4001 |
--写入到文件中
./hive -e "
select
*
from ck.usr_table
" | tr "\t" "," > /fileName.csv
--在clickhouse中建库建表
create database ck;
use ck;
create table usr_table(
id UInt32,
tag_name String,
tag UInt32
)ENGINE = MergeTree()
ORDER BY id;
--写入到CK中
clickhouse-client -m -u default -h 192.168.88.161
--query 'INSERT INTO ck.usr_table FORMAT CSV' < /fileName.csv
-- 存储表结构
create table tag_table(
tag UInt32,
tag_name String,
cust_num AggregateFunction(groupBitmap, UInt64 )
)ENGINE = AggregatingMergeTree()
ORDER BY (tag, tag_name)
SETTINGS index_granularity = 128;
insert into tag_table
select
tag,
tag_name,
groupBitmapState(toUInt64(id)) as cust_num
from usr_table group by tag,tag_name;
--查询表
select tag, tag_name, bitmapToArray(cust_num) from tag_table;
现在表是这样的:
tag | tag_name | cust_num |
1001 | 存款客户 | [1,2,3] |
2001 | 国债客户 | [2,3,4] |
3001 | 卡黑名单客户 | [2,4] |
4001 | 短信黑名单 | [3,4] |
用bitmap进行圈选
WITH
(
SELECT cust_num from tag_table where tag ='1001'
) AS tag1,
(
SELECT cust_num from tag_table where tag ='2001'
) AS tag2,
(
SELECT cust_num from tag_table where tag ='3001'
) AS tag3,
(
SELECT cust_num from tag_table where tag ='4001'
) AS tag4
select bitmapToArray(bitmapAndnot(bitmapOr(tag1, tag2),bitmapOr(tag3, tag4)))
as customer;