本文主要分享ClickHouse的数据导入方式,具体介绍如何使用Flink、Spark、Kafka、MySQL、Hive将数据导入ClickHouse,内容包括:
- 使用Flink导入数据
- 使用Spark导入数据
- 从Kafka中导入数据
- 从MySQL中导入数据
- 从Hive中导入数据
使用Flink导入数据
本节介绍使用 flink-jdbc 将数据导入ClickHouse,Maven依赖为:
<!-- Flink JDBC sink connector; ${scala.binary.version} selects the Scala build (e.g. 2.11 / 2.12).
     NOTE(review): version 1.10.1 matches the Flink runtime used in the example below — keep them in sync. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-jdbc_${scala.binary.version}</artifactId>
<version>1.10.1</version>
</dependency>
示例
本示例使用Kafka connector,通过Flink将Kafka数据实时导入到ClickHouse
public class FlinkSinkClickHouse {
public static void main(String[] args) throws Exception {
String url = "jdbc:clickhouse://192.168.10.203:8123/default";
String user = "default";
String passwd = "hOn0d9HT";
String driver = "ru.yandex.clickhouse.ClickHouseDriver";
int batchsize = 500; // 设置batch size,测试的话可以设置小一点,这样可以立刻看到数据被写入
// 创建执行环境
EnvironmentSettings settings = EnvironmentSettings
.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
String kafkaSource11 = "" +
"CREATE TABLE user_behavior ( " +
" `user_id` BIGINT, -- 用户id\n" +
" `item_id` BIGINT, -- 商品id\n" +
" `cat_id` BIGINT, -- 品类id\n" +
" `action` STRING, -- 用户行为\n" +
" `province` INT, -- 用户所在的省份\n" +
" `ts` BIGINT, -- 用户行为发生的时间戳\n" +
" `proctime` AS PROCTIME(), -- 通过计算列产生一个处理时间列\n" +
" `eventTime` AS TO_TIMESTAMP(FROM_UNIXTIME(ts, 'yyyy-MM-dd HH:mm:ss')), -- 事件时间\n" +
" WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND -- 在eventTime上定义watermark\n" +
") WITH ( 'connector' = 'kafka', -- 使用 kafka connector\n" +
" 'topic' = 'user_behavior', -- kafka主题\n" +
" 'scan.startup.mode' = 'earliest-offset', -- 偏移量,从起始 offset 开始读取\n" +
" 'properties.group.id' = 'group1', -- 消费者组\n" +
" 'properties.bootstrap.servers' = 'kms-2:9092,kms-3:9092,kms-4:9092', -- kafka broker 地址\n" +
" 'format' = 'json', -- 数据源格式为 json\n" +
" 'json.fa