添加依赖:
<!-- Maven dependencies for a Flink job that reads from Kafka and writes to ClickHouse over JDBC. -->
<!-- ${flink.version} and ${scala.binary.version} must be defined as properties elsewhere in the POM (not shown here). -->
<dependencies>
<!-- Apache Flink dependencies -->
<!-- NOTE(review): no "provided" scope is declared on the Flink dependencies below, so Maven applies its default (compile) scope. -->
<!-- If the target cluster already ships Flink, consider adding a provided scope so these are not packaged into the JAR file. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- RocksDB state backend: currently disabled; uncomment the block below to enable it. -->
<!-- <dependency>-->
<!-- <groupId>org.apache.flink</groupId>-->
<!-- <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>-->
<!-- <version>${flink.version}</version>-->
<!-- </dependency>-->
<!-- Table API / SQL: Java and Scala bridges, common classes, and both the legacy and the Blink planner. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Kafka connector, used as the streaming source. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- ClickHouse JDBC driver. -->
<!-- https://mvnrepository.com/artifact/ru.yandex.clickhouse/clickhouse-jdbc -->
<dependency>
<groupId>ru.yandex.clickhouse</groupId>
<artifactId>clickhouse-jdbc</artifactId>
<version>0.3.1</version>
</dependency>
<!-- Flink JDBC module; presumably supplies the JDBC table sink used by the example job (verify against the Flink version in use). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-jdbc_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Add logging framework, to produce console output when running in the IDE. -->
<!-- Declared with runtime scope. NOTE(review): whether these end up in the application JAR depends on the packaging plugin configuration, which is not shown here. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<scope>runtime</scope>
</dependency>
</dependencies>
示例
本示例使用 Kafka connector,通过 Flink 将 Kafka 数据实时导入到 ClickHouse。
/**
 * Example job that reads JSON records from the Kafka topic {@code user_behavior} and appends
 * them to the ClickHouse table {@code behavior_mergetree} through a JDBC append table sink.
 *
 * <p>The ClickHouse connection settings default to the values embedded below and can be
 * overridden with the environment variables {@code CLICKHOUSE_URL}, {@code CLICKHOUSE_USER}
 * and {@code CLICKHOUSE_PASSWORD}.
 */
public class FlinkSinkClickHouse {

    /**
     * Looks up an environment variable, falling back to a default.
     *
     * @param name     name of the environment variable
     * @param fallback value returned when the variable is unset or empty
     * @return the variable's value, or {@code fallback}
     */
    private static String envOrDefault(String name, String fallback) {
        String value = System.getenv(name);
        return (value == null || value.isEmpty()) ? fallback : value;
    }

    /**
     * Builds and runs the Kafka-to-ClickHouse pipeline.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be built or executed
     */
    public static void main(String[] args) throws Exception {
        String url = envOrDefault("CLICKHOUSE_URL", "jdbc:clickhouse://192.168.10.203:8123/default");
        String user = envOrDefault("CLICKHOUSE_USER", "default");
        // NOTE(review): the default password is kept only for backward compatibility with the
        // original example; supply real credentials through CLICKHOUSE_PASSWORD instead.
        String passwd = envOrDefault("CLICKHOUSE_PASSWORD", "hOn0d9HT");
        String driver = "ru.yandex.clickhouse.ClickHouseDriver";
        // Batch size; use a smaller value when testing so written rows become visible right away.
        // NOTE(review): a trailing batch smaller than this may stay buffered in the sink;
        // confirm the flush behavior for the Flink version in use.
        int batchSize = 500;

        // Create the execution environment.
        EnvironmentSettings settings = EnvironmentSettings
                .newInstance()
                .useBlinkPlanner()
                .inStreamingMode()
                .build();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);

        // DDL of the Kafka source table; the SQL text (including its inline comments) is sent as-is.
        String kafkaSourceDdl = "" +
                "CREATE TABLE user_behavior ( " +
                " `user_id` BIGINT, -- 用户id\n" +
                " `item_id` BIGINT, -- 商品id\n" +
                " `cat_id` BIGINT, -- 品类id\n" +
                " `action` STRING, -- 用户行为\n" +
                " `province` INT, -- 用户所在的省份\n" +
                " `ts` BIGINT, -- 用户行为发生的时间戳\n" +
                " `proctime` AS PROCTIME(), -- 通过计算列产生一个处理时间列\n" +
                " `eventTime` AS TO_TIMESTAMP(FROM_UNIXTIME(ts, 'yyyy-MM-dd HH:mm:ss')), -- 事件时间\n" +
                " WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND -- 在eventTime上定义watermark\n" +
                ") WITH ( 'connector' = 'kafka', -- 使用 kafka connector\n" +
                " 'topic' = 'user_behavior', -- kafka主题\n" +
                " 'scan.startup.mode' = 'earliest-offset', -- 偏移量,从起始 offset 开始读取\n" +
                " 'properties.group.id' = 'group1', -- 消费者组\n" +
                " 'properties.bootstrap.servers' = 'kms-2:9092,kms-3:9092,kms-4:9092', -- kafka broker 地址\n" +
                " 'format' = 'json', -- 数据源格式为 json\n" +
                " 'json.fail-on-missing-field' = 'true',\n" +
                " 'json.ignore-parse-errors' = 'false'" +
                ")";
        // Register the Kafka source.
        tEnv.executeSql(kafkaSourceDdl);

        String query = "SELECT user_id,item_id,cat_id,action,province,ts FROM user_behavior";
        Table table = tEnv.sqlQuery(query);

        // Column names and types are declared once so that the JDBC parameter types and the
        // registered sink schema cannot drift apart.
        String[] fieldNames = {"user_id", "item_id", "cat_id", "action", "province", "ts"};
        TypeInformation<?>[] fieldTypes = {
                Types.LONG, Types.LONG, Types.LONG, Types.STRING, Types.INT, Types.LONG
        };

        String insertIntoCkSql = "INSERT INTO behavior_mergetree(user_id,item_id,cat_id,action,province,ts)\n" +
                "VALUES(?,?,?,?,?,?)";
        // Write the data to the ClickHouse sink.
        JDBCAppendTableSink sink = JDBCAppendTableSink
                .builder()
                .setDrivername(driver)
                .setDBUrl(url)
                .setUsername(user)
                .setPassword(passwd)
                .setQuery(insertIntoCkSql)
                .setBatchSize(batchSize)
                .setParameterTypes(fieldTypes)
                .build();

        tEnv.registerTableSink(
                "sink",
                fieldNames,
                fieldTypes,
                sink
        );
        tEnv.insertInto(table, "sink");
        tEnv.execute("Flink Table API to ClickHouse Example");
    }
}
Note:
- 由于 ClickHouse 单次插入的延迟比较高,我们需要设置 BatchSize 来批量插入数据,提高性能。
- 在 JDBCAppendTableSink 的实现中,若最后一批数据的数目不足 BatchSize,则不会插入剩余数据。