大数据应用之 — clickhouse应用实操
在集群上创建库
CREATE DATABASE IF NOT EXISTS tutorial on cluster cluster_lsyk
在任意一台节点执行,发现其他节点都有了
创建集群表
在lsyk01上创建执行:
create table st_order_mt_c on cluster cluster_lsyk (
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine
=ReplicatedMergeTree('/clickhouse/tables/{shard}/st_order_mt_c','{replica}')
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
发现其他节点也都有了这个表
其中的ReplicatedMergeTree 中,
第一个参数是分片的 zk_path 一般按照:
/clickhouse/table/{shard}/{table_name} 的格式,写,如果只有一个分片就写 01 即可。
第二个参数是副本名称,相同的分片副本名称不能相同。
创建分布式表
在lsyk01上执行:
create table st_order_mt_d on cluster cluster_lsyk
(
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
)engine = Distributed(cluster_lsyk,default, st_order_mt_c,hiveHash(sku_id));
--插入数据
insert into st_order_mt_d values
(201,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(202,'sku_002',2000.00,'2020-06-01 12:00:00'),
(203,'sku_004',2500.00,'2020-06-01 12:00:00'),
(204,'sku_002',2000.00,'2020-06-01 12:00:00'),
(205,'sku_003',600.00,'2020-06-02 12:00:00');
Distributed(集群名称,库名,本地表名,分片键)
分片键必须是整型数字,所以用 hiveHash 函数转换,也可以 rand()
查询发现:lsyk01上是3条,lsyk02上也是3条(是lsyk01的副本),lsyk03上是2条
实操
配置集群
配置成3分片1副本的集群
/etc/clickhouse-server/config.d / metrika.xml 的标签增加以下内容
<cluster_lsyk_s1r1> <!-- 集群名称-->
<shard> <!--集群的第1个分片-->
<internal_replication>true</internal_replication>
<!--该分片的第1个副本-->
<replica>
<host>lsyk01</host>
<port>9000</port>
</replica>
</shard>
<shard> <!--集群的第2个分片-->
<internal_replication>true</internal_replication>
<!--该分片的第1个副本-->
<replica>
<host>lsyk02</host>
<port>9000</port>
</replica>
</shard>
<shard> <!--集群的第3个分片-->
<internal_replication>true</internal_replication>
<!--该分片的第1个副本-->
<replica>
<host>lsyk03</host>
<port>9000</port>
</replica>
</shard>
</cluster_lsyk_s1r1>
每台重启集群
官网下载数据
https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz 802M
https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz 405M
创建表
CREATE TABLE tutorial.hits_v1_local on cluster cluster_lsyk_s1r1
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
创建分布式表
CREATE TABLE tutorial.hits_v1_all on cluster cluster_lsyk_s1r1 AS tutorial.hits_v1_local
ENGINE = Distributed(cluster_lsyk_s1r1, tutorial, hits_v1_local, rand());
如果不加 on cluster选项,则只在本节点创建。加上后会在该集群的所有节点创建。
对于删除的也是如此,可以得出,on cluster 选项就是个通用的选项,可以在集群所有节点执行
drop table tutorial.hits_v1_all on cluster cluster_lsyk_s1r1
导入数据
clickhouse-client --query "INSERT INTO tutorial.hits_v1_all FORMAT TSV" --max_insert_block_size=100000 < /softw/clickhouse/hits_v1.tsv
服务器:笔记本虚拟的3个rhel7.8
集群:3节点,3切片1副本
文本大小:7.3G
导入clickhouse:大小约2.8G
记录数:887W
字段:133
导入时间:180s
查询测试
select count(1) from
(SELECT
UserAgent,
WindowClientHeight,
SilverlightVersion1,
CodeVersion,
Age,
BrowserLanguage,
CounterID,
count(1),
sum(RequestNum),
sum(ResolutionWidth),
count(distinct RefererDomain)
FROM tutorial.hits_v1_all
GROUP BY
UserAgent,
WindowClientHeight,
SilverlightVersion1,
CodeVersion,
Age,
BrowserLanguage,
CounterID
ORDER BY
UserAgent ASC,
WindowClientHeight ASC,
SilverlightVersion1 ASC,
CodeVersion ASC,
Age ASC,
BrowserLanguage ASC,
CounterID ASC
)
增加过滤条件后,时间会更少
减少维度的情况下:
再次减少维度:
再次减少维度: