ClickHouse: Benchmarking Partitioning and Numeric-Code Optimizations

In earlier ClickHouse experiments, the minute-bar data was stored without any partitioning and without any processing of its string columns. This post tests optimizations on both fronts.
Optimization 1: partitioning. Partition the table by individual stock. This granularity is rather fine: at a macro level, with roughly 2.2 billion rows, it produces close to 4,000 partitions.

I. Partitioning by individual stock

Data note: the table has already been partitioned by individual stock.
1. Code

Create the table my_db.stock_tb:

CREATE TABLE stock_tb
(
    `code` String,  
    `datetime` DateTime,
    `open` Float32,
    `close` Float32,
    `low` Float32,
    `high` Float32,
    `volume` Float64,
    `money` Float64,
    `factor` Float32,
    `high_limit` Float32,
    `low_limit` Float32,
    `avg` Float32,
    `pre_close` Float32,
    `paused` Float32,
    `open_interest` Float64
)
ENGINE = MergeTree
PARTITION BY code
ORDER BY datetime 

The table is partitioned by code; for now, code is still a String.
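As a quick sanity check on how many partitions this scheme creates, the partition count can be read from system.parts; a minimal sketch using the same clickhouse_driver client as the rest of this post:

from clickhouse_driver import Client

client = Client('localhost')
# count the active partitions of the table; with PARTITION BY code,
# this should be close to one partition per loaded stock
n = client.execute(
    "SELECT count(DISTINCT partition) FROM system.parts "
    "WHERE database = 'my_db' AND table = 'stock_tb' AND active"
)
print(n)  # e.g. [(4000,)] once ~4,000 stocks have been loaded

The full ingest/read script: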

from clickhouse_driver import Client
import pandas as pd
import os
from datetime import datetime, date
import time
import math

def get_all_files_by_root_sub_dirs(directory, file_type):
    data = list()
    if os.path.isdir(directory):  # a directory: walk it recursively
        for (root, sub, files) in os.walk(directory):
            for file in files:
                path = os.path.join(root, file)
                if path.endswith(file_type):
                    data.append(path)
    else:  # not a directory: keep it directly if the extension matches
        if directory.endswith(file_type):
            data.append(directory)
    return data

def get_code_from_csv_file(file):
    # .csv; .h5 file
    # D:\join_quant_data\futures\minute\A.XDCE\A1909.XDCE_2019-07-25_2019-08-12.CSV
    # D:\join_quant_data\futures\minute\A.XDCE.h5
    s = os.path.basename(file)  # A1909.XDCE_2019-07-25_2019-08-12.CSV
    sp = s.split('_')[0]

    if sp.endswith(".csv") or sp.endswith(".CSV"):
        code = sp[:-4]
    else:
        code = sp
    return code  # A1909.XDCE



class Code:
    def __init__(self, header, body, source, type):
        self.header = header
        self.body = body
        self.source = source
        self.type = type  # asset type, e.g. "stock"

    def get_simplecode(self):
        return self.header

    def get_int_code(self):
        return jq_int_code(self)

    def get_asset_type(self):
        return self.type

    def get_fullcode(self, datasource):
        pass
            
def jq_int_code(code):
    if code.source =="XSHG":
        return 10000000+int(code.body)
    elif code.source == "XSHE":
        return 20000000+int(code.body)
    else:
        return 0
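
# Worked example for jq_int_code: "600036.XSHG" -> 10000000 + 600036 = 10600036;
# "000001.XSHE" -> 20000000 + 1 = 20000001; codes from other sources map to 0.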
def split_str_code(str_code):
    return str_code.split('.', 1 )
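# e.g. split_str_code("600036.XSHG") -> ["600036", "XSHG"]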

def insert_data():
    client = Client('localhost')
    database_name = "my_db"
    table_name = 'stock_tb'
    dir_path = "/mnt/d/join_quant_data_stock_product/stock/minute/"
    files = get_all_files_by_root_sub_dirs(dir_path, ".csv")
    t0 = time.time()
    file_num = 0
    for _file in files:
        t_file = time.time()
        print(f"{_file} => file {file_num} of {len(files)}")
        block_insert_data = []  # one batched insert per file
        df = pd.read_csv(_file)
        fullcode = get_code_from_csv_file(_file)
        code_body, source = split_str_code(fullcode)
        code = Code(code_body, code_body, source, "stock")
        intcode = code.get_int_code()
        for row in df.itertuples():
            _row = list(row)[0:15]
            _row[0] = intcode                                         # replace code with its integer form
            _row[1] = datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')  # datetime
            _row[2] = float(row.open)                                 # open
            _row[3] = float(row.close)                                # close
            _row[4] = float(row.low)                                  # low
            _row[5] = float(row.high)                                 # high
            _row[6] = float(row.volume)                               # volume
            _row[7] = float(row.money)                                # money
            _row[8] = float(row.factor)                               # factor
            _row[9] = float(row.high_limit)                           # high_limit
            _row[10] = float(row.low_limit)                           # low_limit
            _row[11] = float(row.avg)                                 # avg
            _row[12] = float(row.pre_close)                           # pre_close
            _row[13] = float(row.paused)                              # paused
            if math.isnan(row.open_interest):
                _row[14] = 0.0
            else:
                _row[14] = float(row.open_interest)                   # open_interest
            block_insert_data.append(_row)
        # rows could also be inserted one at a time as JSON:
        # sql = f"INSERT INTO {database_name}.{table_name} FORMAT JSONEachRow {json.dumps(row_data)}"
        # batched insert
        client.execute(f'INSERT INTO {database_name}.{table_name} VALUES', block_insert_data, types_check=True)
        table_info = client.execute(f'select count(1) from {database_name}.{table_name}')
        print(f"clickhouse stock_tb row count: {table_info}")
        print(f"file {file_num} of {len(files)} => {_file} done! cost time: {time.time()-t_file}")
        file_num = file_num + 1

    print(f"all {file_num} files done! cost time: {time.time()-t0}")

# Example filters:
# SELECT * FROM stock_daily_price WHERE sec_code='600000.SZ' LIMIT 10;
#   AND toDate(EventDate) >= toDate('2013-01-29')
#   AND toDate(EventDate) <= toDate('2013-02-04')
def get_data_from_ch_by_code():
    client = Client('localhost')
    database_name = "my_db"
    table_name = 'stock_tb'
    code = "600036.XSHG"

    query_sql = f"SELECT * FROM {database_name}.{table_name} WHERE code = '{code}' "
    data = client.execute(query_sql)
    print(f"data : {len(data)}")
    return data

def get_data_from_ch_by_code_and_datetime():
    client = Client('localhost')
    database_name = "my_db"
    table_name = 'stock_tb'
    startdate = "2010-01-01"
    enddate = "2022-02-02"
    code = "600036.XSHG"
    query_sql = f"SELECT * FROM {database_name}.{table_name} WHERE code ='{code}' AND toDate(datetime) >= toDate('{startdate}')  AND toDate(datetime) <= toDate('{enddate}') "
    print(query_sql)
    data = client.execute(query_sql)
    print(f"data : {len(data)}")
    return data

if __name__ == "__main__":
    t0 = time.time()
    mode = 0  # 0: insert, 1: read
    if mode == 0:
        insert_data()
    else:
        data = get_data_from_ch_by_code_and_datetime()
    t1 = time.time()
    print(f"get_data cost time : {t1-t0} s! ")

2. Query performance
(1) Single-stock query

SELECT * FROM my_db.stock_tb WHERE code ='600036.XSHG' AND toDate(datetime) >= toDate('2010-01-01')  AND toDate(datetime) <= toDate('2022-02-02') 
data : 704640
get_data cost time : 0.8274369239807129 s! 

Partitioning by individual stock clearly works very well. With a narrower date range, it is faster still:

[Running] python -u "/home/songroom/pyclick/click_test.py"
SELECT * FROM my_db.stock_tb WHERE code ='600036.XSHG' AND toDate(datetime) >= toDate('2021-01-01')  AND toDate(datetime) <= toDate('2022-02-02') 
data : 62880
get_data cost time : 0.08578109741210938 s! 

(2) Cross-stock (cross-partition) queries

Querying about one year of data across all symbols:

[Running] python -u "/home/songroom/pyclick/click_test.py"
SELECT * FROM my_db.stock_tb WHERE  toDate(datetime) >= toDate('2021-01-01')  AND toDate(datetime) <= toDate('2022-02-02') 
Killed

Reducing the data volume to about half a year:

[Running] python -u "/home/songroom/pyclick/click_test.py"
SELECT * FROM my_db.stock_tb WHERE  toDate(datetime) >= toDate('2021-06-01')  AND toDate(datetime) <= toDate('2022-02-02') 
Killed

[Done] exited with code=137 in 143.032 seconds

Reducing the volume further, fetching one month at a time:

[Running] python -u "/home/songroom/pyclick/click_test.py"
SELECT * FROM my_db.stock_tb WHERE  toDate(datetime) >= toDate('2022-01-01')  AND toDate(datetime) <= toDate('2022-02-02') 
data : 15450000
get_data cost time : 19.066221475601196 s! 

3. Further thoughts on partitioning

After partitioning by stock, single-stock queries get much faster, but cross-stock queries slow down markedly. Choosing a partitioning scheme is therefore a trade-off.

II. Partitioning plus numeric codes

On top of the setup above, a second optimization:
Optimization 2: numeric codes. Convert the stock code into an integer type to cut down on string comparisons.
Drop the original stock_tb table and rebuild it as follows:

CREATE TABLE stock_tb
(
    `code` Int32,
    `datetime` DateTime,
    `open` Float32,
    `close` Float32,
    `low` Float32,
    `high` Float32,
    `volume` Float64,
    `money` Float64,
    `factor` Float32,
    `high_limit` Float32,
    `low_limit` Float32,
    `avg` Float32,
    `pre_close` Float32,
    `paused` Float32,
    `open_interest` Float64
)
ENGINE = MergeTree
PARTITION BY code
ORDER BY datetime 

The table is still partitioned by code, and code is now an integer.
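When results come back with integer codes, they can be mapped to the original strings again. A minimal sketch of the inverse of jq_int_code (int_to_str_code is a hypothetical helper, not part of the original script):

def int_to_str_code(int_code):
    # inverse of jq_int_code: 10600036 -> "600036.XSHG", 20000001 -> "000001.XSHE"
    if 10000000 <= int_code < 20000000:
        return f"{int_code - 10000000:06d}.XSHG"
    if 20000000 <= int_code < 30000000:
        return f"{int_code - 20000000:06d}.XSHE"
    return ""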
Partition information:

SELECT
    partition AS `partition`,
    sum(rows) AS `total_rows`,
    formatReadableSize(sum(data_uncompressed_bytes)) AS `uncompressed_size`,
    formatReadableSize(sum(data_compressed_bytes)) AS `compressed_size`,
    round((sum(data_compressed_bytes) / sum(data_uncompressed_bytes)) * 100, 0) AS `compression_pct`
FROM system.parts
WHERE (database IN ('my_db')) AND (table IN ('stock_tb'))
GROUP BY partition
ORDER BY partition ASC

Column-level information for the table:

SELECT
    column AS `column_name`,
    any(type) AS `type`,
    formatReadableSize(sum(column_data_uncompressed_bytes)) AS `uncompressed_size`,
    formatReadableSize(sum(column_data_compressed_bytes)) AS `compressed_size`,
    sum(rows) AS `row_count`
FROM system.parts_columns
WHERE (database = 'my_db') AND (table = 'stock_tb')
GROUP BY column
ORDER BY column ASC

(1) Single-stock conditional query:
Overall, once the table is already partitioned on the code field, converting it to an integer makes little further difference to query speed.

(2) Cross-stock conditional query: one year of data across the whole table.
During this query, memory usage is essentially maxed out.

[Running] python -u "/home/songroom/pyclick/click_test.py"
SELECT count(*) FROM my_db.stock_tb WHERE  toDate(datetime) >= toDate('2021-01-01')  AND toDate(datetime) <= toDate('2022-02-02') 
data : [(251644080,)]
get_data cost time : 0.8955323696136475 s! 

Not just a full year: even half a year of data cannot be fetched. Only counting the rows, as in the count(*) query above, goes through.
Cross-stock queries are clearly very memory-hungry and inefficient, which is presumably related to the partitioning being too fine.

Workaround:
Do not run cross-stock queries inside ClickHouse at all. Let ClickHouse push only per-stock data to the application side, and let the application do the assembly. For example, fetching one year of one stock from ClickHouse takes about 0.1 s, so about 500 stocks take roughly 60 seconds.
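A minimal sketch of that pull-per-stock-and-assemble pattern, assuming the Int32 schema from this section (the codes list is a placeholder; a real universe would hold ~500 entries):

import pandas as pd
from clickhouse_driver import Client

client = Client('localhost')
codes = [10600036, 10600000]  # int codes as produced by jq_int_code; placeholder universe
frames = []
for int_code in codes:
    rows = client.execute(
        "SELECT * FROM my_db.stock_tb "
        "WHERE code = %(code)s "
        "AND toDate(datetime) >= toDate('2021-01-01') "
        "AND toDate(datetime) <= toDate('2022-02-02')",
        {'code': int_code},
    )
    frames.append(pd.DataFrame(rows))
result = pd.concat(frames, ignore_index=True)  # the application side does the cross-stock merge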

III. Reflections

Partitioning on the stock-code field dramatically improves single-stock query performance, but cross-stock queries become very inefficient, and the loss is probably tied to the partitioning being too fine. The table currently holds about 2 billion rows across some 3,000 partitions.
The next step is to find a better arrangement at the architecture level; one direction is sketched below.
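One direction (a sketch only, not benchmarked here): coarsen the partition key so that a group of codes shares one partition, trading a little single-stock locality for far fewer partitions on cross-stock scans. Bucketing via intDiv and putting code first in the sort key is one such variant; stock_tb_bucketed is a hypothetical table name:

from clickhouse_driver import Client

client = Client('localhost')
client.execute("""
CREATE TABLE my_db.stock_tb_bucketed
(
    `code` Int32,
    `datetime` DateTime,
    `open` Float32,
    `close` Float32
    -- ... remaining columns as in stock_tb ...
)
ENGINE = MergeTree
PARTITION BY intDiv(code, 100)   -- ~100 codes per partition instead of one each
ORDER BY (code, datetime)        -- code first keeps single-stock reads contiguous
""")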
