Preface: the IT world has plenty of ETL tools that can migrate tables, but a lightweight approach is often more practical. Here I use plain Python code to migrate a Hive table to ClickHouse (no need to create the table schema in ClickHouse beforehand).
Straight to the code:
# -*- coding: utf-8 -*-
"""
Note: the table is not recreated as a partitioned table in ClickHouse, but partitioned Hive tables can still be transferred.
"""
from clickhouse_driver import Client
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
# Create the SparkContext and HiveContext
conf = SparkConf()
conf.setAppName('hive to ck')
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
# Hive table name (the database prefix keeps its trailing dot so the two parts concatenate)
database = "ads."
hive_table_name = "table_name"
hive_table_name2 = f"{database}{hive_table_name}"
# ClickHouse connection info
clickhouse_driver_name = "com.clickhouse.jdbc.ClickHouseDriver"
clickhouse_url = "jdbc:clickhouse://ip:8123/ads"
clickhouse_user = "default"
clickhouse_password = "password"
# Read data from the Hive table
df = hc.sql(f"SELECT * FROM {hive_table_name2}")
# Cast every column to String, keeping the original column names so the JDBC insert matches the ClickHouse schema
df = df.selectExpr([f"CAST({col} AS STRING) AS {col}" for col in df.columns])
# Get the column names and types of the Hive table
hive_schema = hc.sql(f"DESCRIBE {hive_table_name2}")
hive_schema.show(5, True)
clickhouse_columns = []
seen_columns = set()
for row in hive_schema.collect():
    column_name = row['col_name']
    # For partitioned tables, DESCRIBE also returns a "# Partition Information" section;
    # skip those rows and any duplicated partition columns
    if not column_name or column_name.startswith('#') or column_name in seen_columns:
        continue
    seen_columns.add(column_name)
    column_type = 'String'  # every column is declared as String in ClickHouse
    clickhouse_columns.append(f"{column_name} {column_type}")
# Create the ClickHouse table (MergeTree engine, ordered by the first column)
clickhouse_create_table_query = f"CREATE TABLE IF NOT EXISTS {hive_table_name} ({','.join(clickhouse_columns)}) ENGINE = MergeTree() ORDER BY {clickhouse_columns[0].split()[0]}"
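# For illustration only: with hypothetical columns (id, name, dt), the generated
# statement would look roughly like:
#   CREATE TABLE IF NOT EXISTS table_name (id String,name String,dt String)
#   ENGINE = MergeTree() ORDER BY id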
clickhouse_conn = Client(
    host="ck_ip",
    port=9000,
    database="ads",
    user="default",
    password="password",
    settings={'use_numpy': True}
)
clickhouse_conn.execute(clickhouse_create_table_query)
# Write to ClickHouse through the JDBC data source
df.write \
.format("jdbc") \
.option("driver", clickhouse_driver_name) \
.option("url", clickhouse_url) \
.option("dbtable", hive_table_name) \
.option("user", clickhouse_user) \
.option("password", clickhouse_password) \
.option("rewriteBatchedStatements", "true") \
.option("batchsize", "1000000") \
.mode("append") \
.save()
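# Optional sanity check (not in the original script): compare the Hive-side row count
# with what actually landed in ClickHouse; note that count() triggers a second Spark job
hive_rows = df.count()
ck_rows = clickhouse_conn.execute(f"SELECT count() FROM {hive_table_name}")[0][0]
print(f"hive rows: {hive_rows}, clickhouse rows: {ck_rows}")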
# Stop the SparkContext
sc.stop()
Note: this runs on the Spark engine; submit it to YARN and the migration runs end to end (a minimal submission sketch is shown below).
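A minimal spark-submit sketch, assuming the script above is saved as hive_to_ck.py, that the clickhouse-driver Python package is importable where the driver runs, and that a shaded ClickHouse JDBC jar is available locally (the path and version below are placeholders):

spark-submit \
  --master yarn \
  --deploy-mode client \
  --jars /path/to/clickhouse-jdbc-<version>-all.jar \
  hive_to_ck.py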