ClickHouse Scheduled Archiving Script



ClickHouse official documentation: https://clickhouse.com/docs


1. ClickHouse utility script

#!/bin/bash


PARTITION_SUPPORT_OPERATE=("DETACH" "DROP" "DROP DETACHED" "ATTACH" "FREEZE" "UNFREEZE")
ck_part_ctl() {  
    operate=$(echo "$1" | tr '[:lower:]' '[:upper:]')
    database=$2 
    table=$3  
    partition=$4  

    # Check that required parameters are not empty
    if [ -z "$operate" ] || [ -z "$database" ] || [ -z "$table" ] || [ -z "$partition" ]; then
        echo "Error: One or more of the required parameters (operate, database, table, partition) are empty."
        return 400
    fi

    # Validate the operation type
    operate_valid=0
    for item in "${PARTITION_SUPPORT_OPERATE[@]}"; do
        if [ "$item" == "$operate" ]; then
            operate_valid=1
            break
        fi
    done
    if [ "$operate_valid" == "0" ]; then
        echo "Error: operate is not valid; supported operations are: ${PARTITION_SUPPORT_OPERATE[*]}."
        return 400
    fi

     
  
    # Run the ClickHouse ALTER command
    clickhouse-client --query="ALTER TABLE ${database}.${table} ${operate} PARTITION '${partition}'"

    # Check the command's exit code
    if [ $? -ne 0 ]; then
        echo "Error: Failed to ${operate} partition '${partition}' on table '${database}.${table}'."
        return 1
    fi

    echo "Partition '${partition}' ${operate} completed successfully on table '${database}.${table}'."
}
  

ck_backup() {  
    database=$1 
    table=$2  
    partition=$3  
    partition_file_prefix=$(echo "$partition" | tr -d '-')

    # Backup server
    remote_host=$4
    remote_user=$5
    if [ -z "$remote_host" ]; then  
        remote_host=$GLOBAL_REMOTE_HOST 
    fi 
    if [ -z "$remote_user" ]; then  
        remote_user=$GLOBAL_REMOTE_USER
    fi 
   
    # Check that required parameters are not empty
    if [ -z "$database" ] || [ -z "$table" ] || [ -z "$partition" ] || [ -z "$remote_host" ] || [ -z "$remote_user" ]; then  
        echo "Error: One or more of the required parameters (database, table, partition, remote_host, remote_user) are empty."  
        return 400  
    fi  

    # ClickHouse home directory
    clickhouse_home=$GLOBAL_CLICKHOUSE_HOME
    if [ -z "$clickhouse_home" ]; then  
        echo "Error: GLOBAL_CLICKHOUSE_HOME env variable is required."  
        return 400  
    fi 


    # Remote backup path and connectivity check
    remote_backup_path=$GLOBAL_BACKUP_PATH
    if [ -z "$remote_backup_path" ]; then
        echo "Error: GLOBAL_BACKUP_PATH env variable is required."
        return 400
    fi
    remote_backup_path=$remote_backup_path/${database}/${table}/$partition_file_prefix
    ssh -o ConnectTimeout=5 $remote_user@$remote_host "exit"
    if [ $? -ne 0 ]; then
        echo "Error: Can't connect to remote server $remote_host with user $remote_user."
        return 502
    fi


    # Detach the partition
    ck_part_ctl "DETACH" "$database" "$table" "$partition"
    if [ $? -ne 0 ]; then  
        return 1  
    fi  
  

    # Check whether there is any detached data to back up
    back_dir_cnt=$(find $clickhouse_home/data/${database}/${table}/detached/ -maxdepth 1 -name "$partition_file_prefix"* | wc -l)
    if [ $back_dir_cnt -eq 0 ] ; then
        echo "There is no detached data in ${database}.${table} for partition $partition; nothing to back up."
        return 0
    fi

    # Temporary workspace
    work_space_dir=${GLOBAL_WORK_SPACE_DIR:-/tmp}  
    work_space=$work_space_dir/backup/$database/$table/$partition_file_prefix
    rm -rf "$work_space" && mkdir -p "$work_space" && cd "$work_space" || {  
        echo "Failed to create or enter work space directory."  
        return 1  
    } 

    # Copy the detached parts and compress them into a timestamped tarball
    cp -r $clickhouse_home/data/${database}/${table}/detached/$partition_file_prefix* .
    back_tar_file=$partition_file_prefix-$(date '+%s').tar.gz
    tar czf $back_tar_file $partition_file_prefix*
    cd - > /dev/null

    # Transfer the archive to the backup server
    ssh -o ConnectTimeout=5 $remote_user@$remote_host "mkdir -p $remote_backup_path"
    scp -o ConnectTimeout=5 $work_space/$back_tar_file $remote_user@$remote_host:$remote_backup_path/$back_tar_file
    if [ $? -ne 0 ]; then
        echo "Error: Failed to scp $work_space/$back_tar_file to $remote_user@$remote_host:$remote_backup_path/$back_tar_file."
        rm -rf $work_space
        ck_part_ctl "ATTACH" "$database" "$table" "$partition"
        return 502
    fi
    
    # Remove the workspace and the local detached data
    rm -rf $work_space && rm -rf $clickhouse_home/data/${database}/${table}/detached/$partition_file_prefix*

    echo "Backup of ${database}.${table} partition '${partition}' to remote server $remote_user@$remote_host succeeded."
}  


ck_range_backup() {
    database=$1
    table=$2
    partition_begin=$3
    partition_end=$4

    # Check that required parameters are not empty
    if [ -z "$database" ] || [ -z "$table" ] || [ -z "$partition_begin" ] || [ -z "$partition_end" ]; then
        echo "Error: One or more of the required parameters (database, table, partition_begin, partition_end) are empty."
        return 400
    fi

    # Query all matching partitions from system.parts
    partition_list=$(clickhouse-client --query="select distinct partition from system.parts where database = '${database}' and table = '${table}' and partition between '${partition_begin}' and '${partition_end}' order by partition")
    partition_arr=($partition_list)

    for partition in "${partition_arr[@]}"; do
        ck_backup "${database}" "${table}" "${partition}"
    done
} 

ck_restore() {
    database=$1
    table=$2
    partition=$3
    partition_file_prefix=$(echo "$partition" | tr -d '-')

    # Backup server
    remote_host=$4
    remote_user=$5
    if [ -z "$remote_host" ]; then
        remote_host=$GLOBAL_REMOTE_HOST
    fi
    if [ -z "$remote_user" ]; then
        remote_user=$GLOBAL_REMOTE_USER
    fi

    # Check that required parameters are not empty
    if [ -z "$database" ] || [ -z "$table" ] || [ -z "$partition" ] || [ -z "$remote_host" ] || [ -z "$remote_user" ]; then
        echo "Error: One or more of the required parameters (database, table, partition, remote_host, remote_user) are empty."
        return 400
    fi

    # ClickHouse home directory
    clickhouse_home=$GLOBAL_CLICKHOUSE_HOME
    if [ -z "$clickhouse_home" ]; then
        echo "Error: GLOBAL_CLICKHOUSE_HOME env variable is required."
        return 400
    fi


    # Remote backup path and connectivity check
    remote_backup_path=$GLOBAL_BACKUP_PATH
    if [ -z "$remote_backup_path" ]; then
        echo "Error: GLOBAL_BACKUP_PATH env variable is required."
        return 400
    fi

    remote_backup_path=$remote_backup_path/${database}/${table}/$partition_file_prefix
    ssh -o ConnectTimeout=5 $remote_user@$remote_host "exit"
    if [ $? -ne 0 ]; then
        echo "Error: Can't connect remote server $remote_host with user $remote_user."
        return 502
    fi

    # Temporary workspace
    work_space_dir=${GLOBAL_WORK_SPACE_DIR:-/tmp}
    work_space=$work_space_dir/backup/$database/$table/$partition_file_prefix
    rm -rf "$work_space" && mkdir -p "$work_space" && cd "$work_space" || {
        echo "Failed to create or enter work space directory."
        return 1
    }

    scp -o ConnectTimeout=5 $remote_user@$remote_host:$remote_backup_path/*.gz . >/dev/null 2>&1 || true
    # Check whether any backup archives were fetched
    back_tar_cnt=$(find . -maxdepth 1 -name '*.gz' | wc -l)
    if [ $back_tar_cnt -eq 0 ] ; then
        echo "There is no backup for ${database}.${table} with partition $partition."
        return 0
    fi

    # Extract the archives into the table's detached directory
    for tarFile in $work_space/*; do
        tar xf $tarFile -C $clickhouse_home/data/${database}/${table}/detached/
    done
    cd - > /dev/null && rm -rf $work_space

    # Restore ClickHouse ownership and permissions on the extracted parts
    chown -R clickhouse:clickhouse $clickhouse_home/data/${database}/${table}/detached/
    chmod -R 755 $clickhouse_home/data/${database}/${table}/detached/
    # Attach the restored partition
    ck_part_ctl "ATTACH" "$database" "$table" "$partition"
    if [ $? -ne 0 ]; then
        return 1
    fi


    echo "Restore ${database}.${table} with partition '${partition}' from remote server $remote_user@$remote_host success."
}

Script functions:
ck_part_ctl: performs DETACH, DROP, FREEZE, and other operations on a partition (see the sketch after this list)
Parameter 1: operation; any type in the PARTITION_SUPPORT_OPERATE array, case-insensitive
Parameter 2: database name
Parameter 3: table name
Parameter 4: table partition; most business tables partition by date, e.g. 1970-01-01, while some system tables partition by month, e.g. 202001
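
A minimal usage sketch, assuming the functions above are saved as clickhousectl.sh (the filename sourced by the archiving script in section 2) and that test_db.table1 exists; the partition date is hypothetical:

source ./clickhousectl.sh
# Detach a partition (the operation name is case-insensitive) ...
ck_part_ctl "detach" "test_db" "table1" "2023-01-01"
# ... then attach it back.
ck_part_ctl "ATTACH" "test_db" "table1" "2023-01-01"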

ck_backup: backs up ClickHouse data to a remote server (see the sketch after this list)
Parameter 1: database name
Parameter 2: table name
Parameter 3: table partition; most business tables partition by date, e.g. 1970-01-01, while some system tables partition by month, e.g. 202001
Parameter 4: remote server host (optional; falls back to GLOBAL_REMOTE_HOST)
Parameter 5: remote server user (optional; falls back to GLOBAL_REMOTE_USER)
Required env variable GLOBAL_CLICKHOUSE_HOME: ClickHouse data directory, e.g. /data/clickhouse
Required env variable GLOBAL_BACKUP_PATH: backup path on the remote server, e.g. /data/clickhouse/backup
Env variable GLOBAL_WORK_SPACE_DIR: temporary working directory, created in advance, e.g. /data/clickhouse/tmp (optional; defaults to /tmp)
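
A sketch of a one-off backup, reusing the host, user, and paths from the cron script in section 2; the partition date is hypothetical:

export GLOBAL_CLICKHOUSE_HOME=/data/clickhouse
export GLOBAL_BACKUP_PATH=/data/clickhouse/backup
export GLOBAL_WORK_SPACE_DIR=/data/clickhouse/tmpworkspace   # optional, defaults to /tmp
source ./clickhousectl.sh
# Host and user are passed explicitly, so GLOBAL_REMOTE_HOST/GLOBAL_REMOTE_USER are not needed.
ck_backup "test_db" "table1" "2023-01-01" "192.160.1.1" "root"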

ck_range_backup: builds on ck_backup to back up a range of partitions (see the sketch after this list)
Parameter 1: database name
Parameter 2: table name
Parameter 3: lower bound of the partition range
Parameter 4: upper bound of the partition range
Required env variable GLOBAL_REMOTE_HOST: remote server host
Required env variable GLOBAL_REMOTE_USER: remote server user
Required env variable GLOBAL_CLICKHOUSE_HOME: ClickHouse data directory, e.g. /data/clickhouse
Required env variable GLOBAL_BACKUP_PATH: backup path on the remote server, e.g. /data/clickhouse/backup
Env variable GLOBAL_WORK_SPACE_DIR: temporary working directory, created in advance, e.g. /data/clickhouse/tmp (optional; defaults to /tmp)
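
A sketch backing up one month of daily partitions; the date range is hypothetical:

export GLOBAL_REMOTE_HOST=192.160.1.1
export GLOBAL_REMOTE_USER=root
export GLOBAL_CLICKHOUSE_HOME=/data/clickhouse
export GLOBAL_BACKUP_PATH=/data/clickhouse/backup
source ./clickhousectl.sh
# Backs up every partition of test_db.table1 between the two bounds (inclusive).
ck_range_backup "test_db" "table1" "2023-01-01" "2023-01-31"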

ck_restore: restores a backup, pulling data from the remote backup server into the local ClickHouse (see the sketch after this list)
Parameter 1: database name
Parameter 2: table name
Parameter 3: table partition; most business tables partition by date, e.g. 1970-01-01, while some system tables partition by month, e.g. 202001
Parameter 4: remote server host (optional; falls back to GLOBAL_REMOTE_HOST)
Parameter 5: remote server user (optional; falls back to GLOBAL_REMOTE_USER)
Required env variable GLOBAL_CLICKHOUSE_HOME: ClickHouse data directory, e.g. /data/clickhouse
Required env variable GLOBAL_BACKUP_PATH: backup path on the remote server, e.g. /data/clickhouse/backup
Env variable GLOBAL_WORK_SPACE_DIR: temporary working directory, created in advance, e.g. /data/clickhouse/tmp (optional; defaults to /tmp)
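
A sketch restoring an archived partition: it pulls the tarballs from the backup server, extracts them into detached/, and re-attaches the partition (partition date hypothetical):

export GLOBAL_CLICKHOUSE_HOME=/data/clickhouse
export GLOBAL_BACKUP_PATH=/data/clickhouse/backup
source ./clickhousectl.sh
ck_restore "test_db" "table1" "2023-01-01" "192.160.1.1" "root"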

2. Scheduled archiving task script

#!/bin/bash
export GLOBAL_REMOTE_HOST=192.160.1.1
export GLOBAL_REMOTE_USER=root
export GLOBAL_CLICKHOUSE_HOME=/data/clickhouse
export GLOBAL_BACKUP_PATH=/data/clickhouse/backup
export GLOBAL_WORK_SPACE_DIR=/data/clickhouse/tmpworkspace

base_dir=$(dirname "$0") 
source $base_dir/clickhousectl.sh

# Archive date: one year and one day before now
archive_date=$(date -d "1 year ago 1 day ago" "+%Y-%m-%d")
archive_tables=(table1 table2 table3 table4)
archive_db=test_db

for archive_table in "${archive_tables[@]}"; do
  ck_backup "$archive_db" "$archive_table" "$archive_date"
done

Add the job to crontab:
crontab -e

23 * * * *  /data/clickhouse/script/ck_archive.sh >> /data/clickhouse/script/archive.log 
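
After a run, a quick sanity check (a sketch using the same database, table, and server as above) is to confirm the partition no longer has active parts locally and that the tarball landed on the backup server:

archive_date=$(date -d "1 year ago 1 day ago" "+%Y-%m-%d")
# Should return 0 once the partition has been archived away.
clickhouse-client --query="select count() from system.parts where database = 'test_db' and table = 'table1' and partition = '${archive_date}' and active"
# Backups land under GLOBAL_BACKUP_PATH/<db>/<table>/<partition with dashes removed>.
ssh root@192.160.1.1 "ls /data/clickhouse/backup/test_db/table1/$(echo "$archive_date" | tr -d '-')"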