ClickHouse
General
https://clickhouse.com/docs/en/tutorial/
- OLTP:是传统的关系型数据库,主要操作增删改查,强调事务一致性,比如银行系统、电商系统。
- OLAP:是仓库型数据库,主要是读取数据,做复杂数据分析,侧重技术决策支持,提供直观简单的结果。
- 应用场景
ClickHouse做为列式数据库,列式数据库更适合OLAP场景,OLAP场景的关键特征:
- 绝大多数是读请求
- 数据以相当大的批次(> 1000行)更新,而不是单行更新;或者根本没有更新。
- 已添加到数据库的数据不能修改。
- 对于读取,从数据库中提取相当多的行,但只提取列的一小部分。
- 宽表,即每个表包含着大量的列
- 查询相对较少(通常每台服务器每秒查询数百次或更少)
- 对于简单查询,允许延迟大约50毫秒
- 列中的数据相对较小:数字和短字符串(例如,每个URL 60个字节)
- 处理单个查询时需要高吞吐量(每台服务器每秒可达数十亿行)
- 事务不是必须的
- 对数据一致性要求低
- 每个查询有一个大表。除了他以外,其他的都很小。
- 查询结果明显小于源数据。数据经过过滤或聚合,结果适合于单个服务器的RAM中。
https://blog.51cto.com/u_15127633/4188356
https://www.cnblogs.com/shengyang17/p/10894490.html
- Install & Start at Linux
curl https://clickhouse.com/ | sh
sudo ./clickhouse install
sudo clickhouse start
ps -ef | grep clickhouse
PS: Install From Deb
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client
sudo service clickhouse-server start
clickhouse-client # or "clickhouse-client --password" if you've set up a password.
clickhouse-client --password
密码可在
/etc/clickhouse-server/users.d/default-password.xml
中配置
Some Command
- CREATE TABLE
CREATE TABLE test.Texture (
ID UInt16,
FileName VARCHAR(260),
Width UInt16,
Height UInt16,
TextureType VARCHAR(32),
Multiple4 UInt16,
Is2N UInt16,
EdgeType UInt16
) ENGINE = MergeTree Primary Key(ID);
CREATE TABLE test.TextureInfo_20220913 (
ID Int64,
sID Int64,
tID Int64,
FileName VARCHAR(260),
FileExtName VARCHAR(32),
FileSize Int64,
SvnRevision Int64,
SvnDate datetime,
SvnAuthor VARCHAR(32),
CheckSum VARCHAR(260),
Width Int64,
Height Int64,
TextureType VARCHAR(32),
Multiple4 Int16,
Is2N Int16,
EdgeType Int16
) ENGINE = MergeTree Primary Key(ID);
- INSERT TABLE FROM MySQL
INSERT INTO Ani SELECT * FROM mysql('ip:port', 'db_name', 'table_name', 'user', 'password')
- ALERT TABLE COLUMN Nullable
alter table `test`.SvnFileInfo MODIFY COLUMN FileExtName Nullable(String)
- JOIN
select `SvnFileInfo`.`ID` AS `ID`,`Ani`.`ID` AS `tID`,`SvnFileInfo`.`FileName` AS `FileName`,`SvnFileInfo`.`FileExtName` AS `FileExtName`,
`SvnFileInfo`.`FileSize` AS `FileSize`,`SvnFileInfo`.`SvnRevision` AS `SvnRevision`,`SvnFileInfo`.`SvnDate` AS `SvnDate`,`SvnFileInfo`.`SvnAuthor` AS `SvnAuthor`,
`SvnFileInfo`.`CheckSum` AS `CheckSum`,`Ani`.`BoneCnt` AS `BoneCnt`,`Ani`.`VertexCnt` AS `VertexCnt` from `SvnFileInfo` ANY INNER JOIN `Ani` on(`SvnFileInfo`.`FileName` = `Ani`.`FileName`)
- DROP COLUMN
ALTER TABLE test.TextureInfo_20220913 DROP COLUMN sID;
- BATCH INSERT
"""INSERT INTO DailySource.SceneList_{version} FORMAT JSONEachRow {data_json}""".format(
version="HD20220913T01_02_27",
data_json=json.dumps(list_all))
list_all = [{k: v}, {k: v}, {k: v}, {k: v} ... ]
- JOIN
https://blog.csdn.net/vkingnew/article/details/107160616
Patch Insert From Other Data Source
https://blog.csdn.net/qq_41902618/article/details/121884368
https://www.jianshu.com/p/3ec5399baa23
Python Api
https://blog.csdn.net/qq_41902618/article/details/121884368
https://clickhouse-driver.readthedocs.io/en/latest/quickstart.html#inserting-data
- demo
# @Time : 2022/9/12 21:11
# @Author :
import json
import logging
from clickhouse_sqlalchemy import make_session
from sqlalchemy import create_engine
import pandas as pd
import datetime
import time
from click_house_tool.ms_task import get_mysql_db
from click_house_tool.table_ddl import create_ani, create_srt, create_mesh, create_pss, \
create_texture, create_SvnFileInfo, create_FileDepends, Insert_MySQL2CK
from config.db_config import sql_utils
from config.db_config.sql_utils import POOL_SOURCE, POOL_ANALYSIS
conf = {
"user": "default",
"password": "password",
"server_host": "10.11.ip.ip",
"port": "port",
"db": "db_name"
}
Insert_MySQL2CK = """INSERT INTO %s SELECT * FROM mysql('IP:PORT', '%s', '%s', 'user', 'password')"""
print(datetime.datetime.today().strftime('%Y%m%d'))
connection = 'clickhouse://{user}:{password}@{server_host}:{port}/{db}'.format(**conf)
engine = create_engine(connection, pool_size=100, pool_recycle=3600, pool_timeout=20)
all_table_list = [{"AniInfo": create_ani},
{"MeshInfo": create_mesh},
{"PssInfo": create_pss},
{"SrtInfo": create_srt},
{"TextureInfo": create_texture},
{"FileDepends": create_FileDepends},
{"SvnFileInfo": create_SvnFileInfo}]
def exe_ck_sql(sql="show tables"):
session = make_session(engine)
cursor = session.execute(sql)
try:
fields = cursor._metadata.keys
df = pd.DataFrame([dict(zip(fields, item)) for item in cursor.fetchall()])
print(df)
except Exception as e:
logging.info("ERROR: %s", e)
logging.info("SQL: %s", sql)
finally:
cursor.close()
session.close()
def create_db(create_table=create_srt, version="HD20220913T01_02_27"):
sql_create_table = create_table % version
exe_ck_sql(sql_create_table)
def get_all_table():
session = make_session(engine)
cursor = session.execute("show tables")
try:
res = cursor.fetchall()
return res
except Exception as e:
logging.info("CREATE_ERROR: %s", e)
finally:
cursor.close()
session.close()
def insert_db(table="SrtInfo", version="HD20220913T01_02_27"):
table_name = table + "_" + version
sql_insert_table = Insert_MySQL2CK % (table_name, version, table)
exe_ck_sql(sql_insert_table)
# 插入规则详情
def insert_rule():
table_name = ["hook_mesh_item", "hook_mesh_player", "hook_pss"]
source_table_name = ["hook_svn_mesh_item_new", "hook_svn_mesh_player_new", "hook_svn_pss"]
for t, s_t in zip(table_name, source_table_name):
sql_insert_table = """INSERT INTO %s SELECT * FROM mysql('IP:PORT', '%s', '%s',
'user', 'password')""" % (t, "hook_trunk", s_t)
print(sql_insert_table)
session = make_session(engine)
cursor = session.execute(sql_insert_table)
try:
fields = cursor._metadata.keys
df = pd.DataFrame([dict(zip(fields, item)) for item in cursor.fetchall()])
print(df)
except Exception as e:
logging.info("INSERT_ERROR: %s", e)
finally:
cursor.close()
session.close()
def insert_svn_info(version="HD20220913T01_02_27"):
sql_p = """SELECT FileName, COUNT(FileName) AS ParentCount FROM %s.`FileDepends` GROUP BY FileName""" % version
sql_c = """SELECT DependFile, COUNT(DependFile) AS ChildrenCount FROM %s.`FileDepends` GROUP BY DependFile""" % version
sql_select = """select * from %s.SvnFileInfo where id >= %s limit 100000"""
sql_util_source = sql_utils.SqlUtil(POOL_SOURCE)
p_dict = {}
c_dict = {}
p_tuple = sql_util_source.select_all_data_list(sql_p)
for p_t in p_tuple:
p_dict[p_t[0].lower()] = p_t[1]
c_tuple = sql_util_source.select_all_data_list(sql_c)
for c_t in c_tuple:
c_dict[c_t[0].lower()] = c_t[1]
f_count = 0
f_size = 0
id_s = 1
while True:
data_list = sql_util_source.select_all_data(sql_select % (version, id_s))
id_s += 100000
for d_l in data_list:
d_l["ParentCount"] = p_dict.get(d_l["FileName"].lower())
d_l["ChildrenCount"] = c_dict.get(d_l["FileName"].lower())
f_count += 1
f_size += d_l["FileSize"]
mul_insert = """INSERT INTO DailySource.SvnFileInfo_{version} FORMAT JSONEachRow {data_json}""".format(
version=version,
data_json=json.dumps(data_list, default=str))
exe_ck_sql(mul_insert)
if len(data_list) < 100000:
break
return f_count, f_size
def modify_report(ver, f_count, f_size):
sql_util_source = sql_utils.SqlUtil(POOL_ANALYSIS)
sql_insert = """INSERT INTO report (name, suffix, file_count, file_size, status, create_time, update_time)
VALUES (%s, %s, %s, %s, %s, %s, %s)"""
sql_util_source.insert_data(sql_insert, (ver, ver, f_count, f_size, 2,
datetime.datetime.now(), datetime.datetime.now()))
if __name__ == '__main__':
start_time = time.time()
version = "HD20220910T01_03_11" # 获取版本号
# insert_svn_info() # 记录引用计数
tables = get_all_table() # 获取DB的所有表名
# version = get_mysql_db()
file_count, file_size = 0, 0
for i in all_table_list:
flag = 1
(key, value), = i.items()
for j in tables: # 此处有优化空间, 判断是此表在DB是否存在
if key + "_" + version == j[0]:
flag = 0
break
if flag:
create_db(create_table=value, version=version)
if key == "SvnFileInfo":
file_count, file_size = insert_svn_info(version)
else:
insert_db(table=key, version=version)
modify_report(version, file_count, file_size)
end_time = time.time()
print("耗时: {:.2f}秒".format(end_time - start_time)) # 487 seconds
Client-DBeaver
WebUI
http://your_ip:your_port/play
嵌入自己的账号和密码就可以使用了