1. Requirements
Requirement: store some detail-level data in ClickHouse so that events can be searched online; to do this, develop a Flink ClickHouse sink.
Target Flink versions: 1.10 and 1.12.
ClickHouse's Java API is itself built on the JDBC interface, so we follow the same approach as Flink's JDBCSinkFunction: extend RichSinkFunction, and pair it with a JDBCOutputFormat that does the actual writing to ClickHouse.
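To see what the sink is wrapping, the plain JDBC flow against ClickHouse looks roughly like the sketch below (host, port and table are the ones used in the demo further down; empty credentials and the sample values are assumptions for illustration):

// Minimal sketch: writing one row to ClickHouse over plain JDBC.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class ClickHouseJdbcSketch {
    public static void main(String[] args) throws Exception {
        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        try (Connection conn = DriverManager.getConnection("jdbc:clickhouse://192.168.7.123:8123/default");
             PreparedStatement ps = conn.prepareStatement(
                     "insert into t_rsd_incident values(?,?,?,?,?,?,?)")) {
            ps.setString(1, "id-1");
            ps.setString(2, "192.168.1.1");          // src_ip
            ps.setString(3, "192.168.1.2");          // dest_ip
            ps.setString(4, "areaA");                // src_network_area
            ps.setString(5, "areaB");                // dest_network_area
            ps.setString(6, "2020-01-01 00:00:00");  // incident_time
            ps.setString(7, "2020-01-01");           // date (partition column)
            ps.addBatch();
            ps.executeBatch();                       // batching is exactly what the sink automates
        }
    }
}

The sink below does the same thing, but manages the connection lifecycle and the batch size inside Flink's operator lifecycle.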
2. Code
2.1 JDBCSinkFunction code
package com.rongan.realtime.sink;

import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

public class JDBCSinkFunction<T> extends RichSinkFunction<T> implements CheckpointedFunction {

    final RAJDBCOutputFormat<T> outputFormat;

    public JDBCSinkFunction(RAJDBCOutputFormat<T> outputFormat) {
        this.outputFormat = outputFormat;
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) {
        // flush the pending batch on every checkpoint
        outputFormat.flush();
    }

    @Override
    public void initializeState(FunctionInitializationContext context) {
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        RuntimeContext ctx = getRuntimeContext();
        outputFormat.setRuntimeContext(ctx);
        outputFormat.open(ctx.getIndexOfThisSubtask(), ctx.getNumberOfParallelSubtasks());
    }

    @Override
    public void close() throws Exception {
        outputFormat.close();
        super.close();
    }

    @Override
    public void invoke(T value, Context context) throws Exception {
        outputFormat.writeRecord(value);
    }
}
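Because snapshotState() flushes the pending batch, buffered rows are written out every time Flink takes a checkpoint, which should give at-least-once delivery into ClickHouse as long as checkpointing is enabled in the job. A minimal Java sketch of enabling it (the 60-second interval is just an illustrative value):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(60 * 1000); // checkpoint (and therefore flush) roughly every 60 seconds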
2.2 RAJDBCOutputFormat code
package com.rongan.realtime.sink;

import org.apache.flink.api.java.io.jdbc.AbstractJDBCOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Field;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class RAJDBCOutputFormat<T> extends AbstractJDBCOutputFormat<T> {

    private static final long serialVersionUID = 1L;
    private static final Logger LOG = LoggerFactory.getLogger(RAJDBCOutputFormat.class);

    private final String query;
    private final int batchInterval;

    private PreparedStatement upload;
    private int batchCount = 0;

    public RAJDBCOutputFormat(String username, String password, String drivername,
                              String dbURL, String query, int batchInterval) {
        super(username, password, drivername, dbURL);
        this.query = query;
        this.batchInterval = batchInterval;
    }

    /**
     * Connects to the target database and initializes the prepared statement.
     *
     * @param taskNumber The number of the parallel instance.
     * @throws IOException Thrown, if the output could not be opened due to an
     *                     I/O problem.
     */
    @Override
    public void open(int taskNumber, int numTasks) throws IOException {
        try {
            establishConnection();
            upload = connection.prepareStatement(query);
        } catch (SQLException sqe) {
            throw new IllegalArgumentException("open() failed.", sqe);
        } catch (ClassNotFoundException cnfe) {
            throw new IllegalArgumentException("JDBC driver class not found.", cnfe);
        }
    }

    @Override
    public void writeRecord(T row) throws IOException {
        try {
            setRecordToStatement(upload, row);
            upload.addBatch();
        } catch (SQLException e) {
            throw new RuntimeException("Preparation of JDBC statement failed.", e);
        }
        batchCount++;
        if (batchCount >= batchInterval) {
            // execute batch
            flush();
        }
    }

    /**
     * Maps the bean's fields onto the statement parameters by reflection:
     * the i-th declared field fills the i-th '?' of the insert statement.
     */
    private void setRecordToStatement(PreparedStatement ps, T obj) {
        Field[] fields = obj.getClass().getDeclaredFields();
        for (int i = 0; i < fields.length; i++) {
            Field field = fields[i];
            // make private fields accessible
            field.setAccessible(true);
            try {
                // read the field value and bind it to the statement
                Object o = field.get(obj);
                ps.setObject(i + 1, o);
            } catch (Exception e) {
                throw new RuntimeException("Failed to bind field " + field.getName() + " to the statement.", e);
            }
        }
    }

    void flush() {
        try {
            upload.executeBatch();
            batchCount = 0;
        } catch (SQLException e) {
            throw new RuntimeException("Execution of JDBC statement failed.", e);
        }
    }

    int[] getTypesArray() {
        return null;
    }

    /**
     * Executes prepared statement and closes all resources of this instance.
     *
     * @throws IOException Thrown, if the input could not be closed properly.
     */
    @Override
    public void close() throws IOException {
        if (upload != null) {
            flush();
            try {
                upload.close();
            } catch (SQLException e) {
                LOG.warn("JDBC statement could not be closed: " + e.getMessage());
            } finally {
                upload = null;
            }
        }
        closeDbConnection();
    }
}
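The constructor arguments are, in order: username, password, JDBC driver class, JDBC URL, the insert statement, and the batch size at which flush() is triggered. Because of the reflection-based binding, the bean passed to writeRecord must declare its fields in the same order as the '?' placeholders and the table columns. A minimal construction sketch (connection details are the ones used in the demo below; empty credentials and the batch size of 1000 are assumptions for illustration):

RAJDBCOutputFormat<Incident> format = new RAJDBCOutputFormat<>(
        "",                                           // username (assumed empty)
        "",                                           // password (assumed empty)
        "ru.yandex.clickhouse.ClickHouseDriver",      // JDBC driver class
        "jdbc:clickhouse://192.168.7.123:8123/default",
        "insert into t_rsd_incident values(?,?,?,?,?,?,?)",
        1000);                                        // flush after every 1000 records
JDBCSinkFunction<Incident> sink = new JDBCSinkFunction<>(format);
// incidentStream.addSink(sink);  // attach to a DataStream<Incident>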
2.3 Demo: reading from Kafka and inserting into ClickHouse
ClickHouse table DDL:
create table t_rsd_incident (
    id String,
    src_ip String,
    dest_ip String,
    src_network_area String,
    dest_network_area String,
    incident_time String,
    date String
) engine = MergeTree
partition by date
primary key (id)
order by (id)
Demo code:
package com.rongan.realtime.demo

import java.util.{Date, Properties}

import com.rongan.realtime.sink.{JDBCSinkFunction, RAJDBCOutputFormat}
import com.rongan.realtime.util.{DateUtils, FlinkUtil}
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import org.apache.flink.table.api.Table
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row
import org.apache.kafka.clients.consumer.ConsumerConfig

object WordCount2 {
  def main(args: Array[String]): Unit = {
    // 1. create the Flink execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    val tableEnv: StreamTableEnvironment = FlinkUtil.initTableEnv(env)

    // 2. create the Kafka source
    val props = new Properties()
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.7.128:9092")
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "TEST")
    val consumer = new FlinkKafkaConsumer011[String]("demo1", new SimpleStringSchema(), props)
    // Kafka source stream with parallelism 3
    val kafkaDataStream: DataStream[String] = env.addSource(consumer).setParallelism(3)

    // 3. register the stream as a table
    // kafkaDataStream.print()
    tableEnv.createTemporaryView("kafkaStream", kafkaDataStream, 'message)

    // 4. run the SQL query
    val table: Table = tableEnv.sqlQuery(
      "select message as id, '192.168.1.1' as srcIp, '192.168.1.1' as destIp, 'bsdf' as srcNetworkArea, 'sfse' as destNetworkArea, " +
        "DATE_FORMAT(LOCALTIMESTAMP, 'yyyy-MM-dd HH:mm:ss') as incidentTime from kafkaStream")

    // 5. convert the result into the bean class that matches the ClickHouse table
    val incidentStream: DataStream[Incident] = tableEnv.toAppendStream[Row](table)
      .map(row => {
        val id: String = row.getField(0).asInstanceOf[String]
        val srcIp: String = row.getField(1).asInstanceOf[String]
        val destIp: String = row.getField(2).asInstanceOf[String]
        val srcNetworkArea: String = row.getField(3).asInstanceOf[String]
        val destNetworkArea: String = row.getField(4).asInstanceOf[String]
        val incidentTime: String = row.getField(5).asInstanceOf[String]
        Incident(id, srcIp, destIp, srcNetworkArea, destNetworkArea, incidentTime,
          DateUtils.targetFormat(new Date(), DateUtils.YYYY_MM_DD))
      })

    // 6. create the sink function and attach it
    val format = new RAJDBCOutputFormat[Incident]("", "", "ru.yandex.clickhouse.ClickHouseDriver",
      "jdbc:clickhouse://192.168.7.123:8123/default",
      "insert into t_rsd_incident values(?,?,?,?,?,?,?)", 1)
    val functionSinkFunction = new JDBCSinkFunction[Incident](format)
    incidentStream.addSink(functionSinkFunction)

    // 7. execute the job (execute must be called only once)
    env.execute("job")
  }
}

case class Incident(id: String, srcIp: String, destIp: String, srcNetworkArea: String,
                    destNetworkArea: String, incidentTime: String, date: String)
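To exercise the demo end to end, one option is to push a test message into the demo1 topic and then check the t_rsd_incident table. A minimal sketch using the plain Kafka producer client (the broker address is the one configured above; having the Kafka client dependency on the classpath is assumed):

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class DemoProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.7.128:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // each message value becomes the id column of one ClickHouse row
            producer.send(new ProducerRecord<>("demo1", "test-id-1"));
        }
    }
}

Once the Flink job has processed the message, the row should be visible in t_rsd_incident, e.g. via select * from t_rsd_incident.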