Flink消费Kafka插入Clickhouse
一、Maven依赖
<!-- Flink client-side artifact (Scala 2.11 build); same version as the other Flink artifacts below -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>1.13.1</version>
</dependency>
<!-- Flink core Java API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.13.1</version>
</dependency>
<!-- Flink DataStream API; the job class uses StreamExecutionEnvironment -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.13.1</version>
</dependency>
<!-- Kafka client library; the job class references ConsumerConfig / ProducerConfig -->
<!-- NOTE(review): this pins a client version explicitly next to the Flink 1.13.1 connector; confirm the two are compatible -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>3.1.0</version>
</dependency>
<!-- Flink Kafka connector; the job class builds its source with KafkaSource -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>1.13.1</version>
</dependency>
<!-- SLF4J binding; the operators obtain their loggers through LoggerFactory -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.7.36</version>
</dependency>
<!-- JSON parsing -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>2.0.3</version>
</dependency>
<!-- ClickHouse JDBC driver; the sink operator uses ClickHouseDataSource / ClickHouseConnection -->
<dependency>
<groupId>ru.yandex.clickhouse</groupId>
<artifactId>clickhouse-jdbc</artifactId>
<version>0.3.2</version>
</dependency>
二、Job类
/**
 * Entry point of the Kafka-to-ClickHouse streaming job.
 *
 * <p>Builds a {@code KafkaSource} that reads {@code TOPIC} as plain strings starting from the
 * latest offsets, then chains {@code KafkaSourceFlatMap} and {@code ClickHouseSinkFlatMap}
 * behind it. Every operator runs with parallelism 1.
 *
 * @param args command-line arguments; not read
 * @throws IllegalStateException if the job cannot be built or its execution fails
 */
public static void main(String[] args) {
    try {
        Properties properties = new Properties();
        // This is a consumer, so use the consumer-side constant (same key, clearer intent).
        properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKERS);
        properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, GROUP_ID);
        // KafkaSource reads its discovery interval from "partition.discovery.interval.ms";
        // "flink.partition-discovery.interval-millis" is only honoured by the legacy
        // FlinkKafkaConsumer. Values go through setProperty as Strings because
        // Properties.getProperty ignores non-String values stored with put().
        properties.setProperty("partition.discovery.interval.ms", String.valueOf(INTERVAL_MILLIS));
        // NOTE(review): "refresh.leader.backoff.ms" looks like a setting of the old Scala
        // consumer; confirm the Java client in use still reads it.
        properties.setProperty("refresh.leader.backoff.ms", String.valueOf(REFRESH_MS));

        KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
                .setTopics(TOPIC)
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setProperties(properties)
                .setStartingOffsets(OffsetsInitializer.latest())
                .build();

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpoint every 120 s, with at least 180 s between two checkpoints.
        env.enableCheckpointing(120000L, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(180000L);
        // No event-time logic in this pipeline: disable periodic watermark emission.
        env.getConfig().setAutoWatermarkInterval(0);

        env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource_sourceName")
                .uid("kafkaSource_sourceName_uid")
                .name("kafkaSource_sourceName_name")
                .setParallelism(1)
                .flatMap(new KafkaSourceFlatMap())
                .uid("KafkaSourceFlatMap_uid")
                .name("KafkaSourceFlatMap_name")
                .setParallelism(1)
                .flatMap(new ClickHouseSinkFlatMap(Constants.PARALL))
                .uid("ClickHouseSinkFlatMap_uid")
                .name("ClickHouseSinkFlatMap_name")
                .setParallelism(1);

        env.execute("KafkaToClickHouseJob");
    } catch (Exception e) {
        LOG.error("KafkaToClickHouseJob启动失败!", e);
        // Propagate the failure so the launcher does not report a successful run.
        throw new IllegalStateException("KafkaToClickHouseJob failed", e);
    }
}
二、Kafka FlatMap算子
public class KafkaSourceFlatMap extends RichFlatMapFunction<String, Map<String, String>> {
private static final Logger log = LoggerFactory.getLogger(KafkaSourceFlatMap.class);
@Override
public void flatMap(String str, Collector<Map<String, String>> out) {
try {
String user_id;
String event_type;
String label_value;
String recv_date;
String recv_time;
out.collect(listToMap(user_id.trim(),
event_type.trim(),
label_value.trim(),
recv_time));
}
public Map<String, String> listToMap(String... str) {
Map<String, String> resMap = new HashMap<>();
for (int i = 0; i < str.length; i++) {
resMap.put((i + 1) + Constants.NULL, str[i]);
}
return resMap;
}
}
三、Clickhouse FlatMap算子
/**
 * Buffers incoming rows in a JDBC batch and writes them to ClickHouse once
 * {@code batch_size} rows have accumulated. Nothing is emitted downstream.
 *
 * <p>Each input map is expected to hold its values under the keys
 * {@code 1 + Constants.NULL}, {@code 2 + Constants.NULL}, ...; value {@code i} is bound to
 * placeholder {@code i} of {@code Constants.INSERT_SQL}.
 *
 * <p>Insert failures are logged, not rethrown, so the job keeps running.
 *
 * <p>NOTE(review): rows buffered between two flushes live only in memory and are not flushed
 * when a checkpoint is taken. After a failure and restore, up to {@code batch_size - 1} rows
 * may be missing. Consider flushing from a checkpoint hook if that matters.
 */
public class ClickHouseSinkFlatMap extends RichFlatMapFunction<Map<String, String>, String> {
    private static final Logger LOG = LoggerFactory.getLogger(ClickHouseSinkFlatMap.class);

    /** Number of buffered rows that triggers a write. */
    private final int batchSize;
    /** Rows added to the current batch and not yet written. */
    private int count = 0;
    /** Rows written so far; long so that a long-running stream cannot overflow it. */
    private long total = 0L;
    // JDBC handles are created lazily on the task and must never be serialized with the function.
    private transient ClickHouseConnection connection;
    private transient PreparedStatement preparedStatement;

    /**
     * @param batch_size number of rows to buffer before each write
     */
    public ClickHouseSinkFlatMap(int batch_size) {
        this.batchSize = batch_size;
    }

    /**
     * Adds one row to the current batch and writes the batch when it is full.
     *
     * @param map position-keyed column values of one row
     * @param collector unused; this operator produces no output
     */
    @Override
    public void flatMap(Map<String, String> map, Collector<String> collector) {
        try {
            if (!ensureStatement()) {
                LOG.error("ClickHouse connection unavailable, dropping record");
                return;
            }
            for (int i = 1; i <= map.size(); i++) {
                preparedStatement.setString(i, map.get(i + Constants.NULL));
            }
            preparedStatement.addBatch();
            count++;
            if (count >= batchSize) {
                flush();
            }
        } catch (Exception ex) {
            LOG.error("ClickhouseSink插入报错====", ex);
        }
    }

    /**
     * Writes the rows still buffered and releases the JDBC resources when the task shuts down.
     *
     * @throws Exception if the superclass close fails
     */
    @Override
    public void close() throws Exception {
        try {
            flush();
        } finally {
            closeQuietly(preparedStatement, "prepared statement");
            preparedStatement = null;
            closeQuietly(connection, "connection");
            connection = null;
            super.close();
        }
    }

    /**
     * Opens a connection with the credentials and URL from {@code Constants}.
     *
     * @return an open connection, or {@code null} if it could not be established
     */
    public static ClickHouseConnection getConn() {
        ClickHouseProperties properties = new ClickHouseProperties();
        properties.setUser(Constants.USERNAME);
        properties.setPassword(Constants.PASSWORD);
        properties.setDatabase(Constants.DATABASE);
        properties.setMaxRetries(3);
        properties.setSocketTimeout(600000);
        ClickHouseDataSource clickHouseDataSource = new ClickHouseDataSource(Constants.URL, properties);
        try {
            return clickHouseDataSource.getConnection();
        } catch (SQLException e) {
            // Previously swallowed silently; callers still get null but the cause is now visible.
            LOG.error("Failed to open ClickHouse connection", e);
            return null;
        }
    }

    /** Lazily creates the connection and the statement; returns false when no connection is available. */
    private boolean ensureStatement() throws SQLException {
        if (connection == null) {
            connection = getConn();
            if (connection == null) {
                return false;
            }
        }
        // Checked separately so a failed prepareStatement is retried on the next record.
        if (preparedStatement == null) {
            preparedStatement = connection.prepareStatement(Constants.INSERT_SQL);
        }
        return true;
    }

    /** Executes the pending batch; on failure the batch and counter are kept so the next call retries. */
    private void flush() {
        if (count == 0 || preparedStatement == null) {
            return;
        }
        try {
            preparedStatement.executeBatch();
            total += count;
            LOG.info("已经成功插入{}条数据!!!", total);
            preparedStatement.clearBatch();
            count = 0;
        } catch (Exception ee) {
            LOG.error("数据插入clickhouse 报错:", ee);
        }
    }

    /** Closes a resource, logging instead of propagating any failure. */
    private static void closeQuietly(AutoCloseable resource, String what) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            LOG.warn("Failed to close ClickHouse {}", what, e);
        }
    }
}
四、Clickhouse建表
-- Local table that physically stores the rows on every node of cluster "default".
-- It is created in database "test" because the Distributed table below refers to
-- test.user_tag_local; the original created it in "rawdata", which left that reference dangling.
-- NOTE(review): if "rawdata" is the intended database, change all three references instead.
CREATE TABLE IF NOT EXISTS test.user_tag_local ON CLUSTER default
(
    user_id String,
    event_type String,
    label_value String,
    recv_date Date,
    recv_time DateTime64(3, 'Asia/Istanbul')
)
ENGINE = MergeTree()
PARTITION BY recv_date
ORDER BY recv_time;

-- Distributed table with the same structure as test.user_tag_local;
-- rand() is the sharding key.
CREATE TABLE IF NOT EXISTS test.user_tag ON CLUSTER default
AS test.user_tag_local
ENGINE = Distributed('default', 'test', 'user_tag_local', rand());