-- 1. 创建表 (Create tables)
-- Create a local MergeTree table on every node of the cluster.
-- NOTE: the legacy engine syntax MergeTree(date_col, key) is deprecated and
-- rejected by modern ClickHouse servers; use explicit PARTITION BY / ORDER BY.
CREATE TABLE IF NOT EXISTS queryDatatest ON CLUSTER default_cluster
(
    `query` String,
    `partition_stat_date` Date
)
ENGINE = MergeTree
PARTITION BY partition_stat_date
ORDER BY query;
-- Distributed "umbrella" table that fans reads/writes out to the local
-- queryIpData table on each shard; rand() spreads inserts evenly.
-- The engine signature is Distributed(cluster, database, table[, sharding_key]).
-- The original omitted the database argument, so ClickHouse would parse
-- `queryIpData` as the database and `rand()` as the table name.
-- NOTE(review): currentDatabase() assumes the local table lives in the session
-- database; if it is actually hm.queryIpData, write `hm` explicitly — confirm.
CREATE TABLE hm.dis_queryIpData ON CLUSTER default_cluster
AS queryIpData
ENGINE = Distributed(default_cluster, currentDatabase(), queryIpData, rand());
-- 2. 删数据 (Delete data)
-- (1) 删表 (drop a table)
-- Drop the table on every node; IF EXISTS makes the statement idempotent
-- (safe to re-run even if some replicas already dropped it).
DROP TABLE IF EXISTS queryData ON CLUSTER default_cluster;
-- (2) 删账期 (delete one accounting period / one day of data)
-- Delete one accounting period via a mutation (runs asynchronously on each node).
-- Bug fix: the original literal ended with a curly quote (’) instead of a plain
-- ASCII quote ('), which is a syntax error.
-- NOTE: since partition_stat_date is the partition key, ALTER TABLE ... DROP
-- PARTITION '2023-02-18' is the cheaper way to remove a whole day.
ALTER TABLE queryData ON CLUSTER default_cluster DELETE WHERE partition_stat_date = '2023-02-18';
-- (3) 删字段 (drop a column)
-- Drop a single column cluster-wide.
-- Fixes: missing terminating semicolon; keywords uppercased for consistency
-- with the rest of the file.
ALTER TABLE Data ON CLUSTER default_cluster DROP COLUMN `ip`;
-- (4) 清空表 (truncate a table)
-- Remove all rows on every node but keep the table definition.
TRUNCATE TABLE ipDatatest ON CLUSTER default_cluster;
-- 3. 查数据 (Query / export data)
# Export the top 50k queries for one day as CSV via the native client.
# Fix: DISTINCT is not a function — `distinct(query),query_num` already applied
# DISTINCT to the whole (query, query_num) row, so spell it out honestly.
# NOTE(review): --database expects a database name; "table" looks like a
# placeholder — confirm before running. 密码 is the password placeholder.
clickhouse-client -h ip -m -u admin --password 密码 --database="table" --query="select distinct query, query_num from queryData where partition_stat_date = '2021-06-06' order by query_num desc limit 50000 FORMAT CSV" > test.csv
# 4. Python 调用 (write query results to ClickHouse from PySpark)
# Write the result of a Spark SQL (Hive) query into ClickHouse over JDBC.
# NOTE(review): the snippet references `self` and `datestr`, so it presumably
# lives inside a class method — confirm against the surrounding file.
CLICKHOUSE_URL_NEW = "jdbc:clickhouse://ip:port/db"
CLICKHOUSE_DRIVER_NEW = 'ru.yandex.clickhouse.ClickHouseDriver'

# Fix: the original read `set spark = SparkSession ...`; `set` is not an
# assignment keyword in Python and made the line a syntax error.
# Fix: the autoCalculateInitialPartitionNum key had a trailing space, so Spark
# would have set a nonexistent config key and silently ignored the setting.
spark = SparkSession \
    .builder \
    .appName("clickhouse_data") \
    .enableHiveSupport() \
    .config("spark.master", 'yarn') \
    .config("spark.debug.maxToStringFields", "100") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.allowAdditionalShuffle", "true") \
    .config("spark.sql.adaptive.autoCalculateInitialPartitionNum", "true") \
    .config("spark.sql.adaptive.shuffle.targetPostShuffleInputSize", 128000000) \
    .getOrCreate()
self.spark = spark

# JDBC writer options: large batchsize + rewriteBatchedStatements enables bulk
# inserts; socket_timeout is in milliseconds (300 s).
clickhouse_prop_new = {'driver': CLICKHOUSE_DRIVER_NEW,
                       "socket_timeout": "300000",
                       "rewriteBatchedStatements": "true",
                       "batchsize": "1000000",
                       "numPartitions": "1",
                       'user': 'admin',
                       'password': '***'}
self.clickhouse_prop_new = clickhouse_prop_new

# Fix: the original triple-quoted literal ended with '{}''' — the trailing '''
# closed the string, so the generated SQL was missing the closing quote after
# the date value.
# NOTE(review): datestr is interpolated straight into SQL; acceptable for an
# internal batch job, but never pass user-supplied input through this path.
selectSqlWise = "select 字段1 from test where stat_date = '{}'".format(datestr)
result = self.spark.sql(selectSqlWise)
result.show()
result.write.jdbc(url=CLICKHOUSE_URL_NEW, table='Data', mode='append',
                  properties=self.clickhouse_prop_new)