1. Background
Querying tens of billions of log records per day in real time is a challenge. The architecture adopts Kafka + Flink + ClickHouse + Redash to deliver real-time analysis over massive data volumes. On the compute layer, we built a real-time data platform on the Flink engine that simplifies development: schemas are generated dynamically from configuration, and the underlying data parsing is unified, so there is no need to reinvent the wheel. The entire pipeline, from ingestion through transformation, storage, and visualization, is completed through configuration without writing a single line of code. This post focuses on the practice of writing real-time log data into ClickHouse.
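To make the end-to-end flow concrete, here is a minimal sketch of how such a job could be wired together. Everything in it, the topic name, broker address, tab-separated log format, and three-column schema, is a hypothetical stand-in for the platform's configuration-driven setup (Flink 1.x DataStream API with the FlinkKafkaConsumer connector assumed); the ClickhouseSink it attaches is the one shown in the next section.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.types.Row;

public class LogToClickhouseJob {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties kafkaProps = new Properties();
        kafkaProps.setProperty("bootstrap.servers", "kafka:9092"); // hypothetical broker
        kafkaProps.setProperty("group.id", "log-etl");

        // Consume raw log lines from Kafka; the topic name is illustrative.
        DataStream<String> source = env.addSource(
                new FlinkKafkaConsumer<>("app-logs", new SimpleStringSchema(), kafkaProps));

        // In the real platform this parsing is generated from configuration;
        // a fixed three-column schema stands in for it here.
        DataStream<Row> rows = source
                .map(LogToClickhouseJob::parseToRow)
                .returns(Types.ROW(Types.STRING, Types.STRING, Types.STRING));

        String[] tableColums = {"ts", "level", "message"};
        List<String> types = Arrays.asList("String", "String", "String");

        // Hand the rows to the batching sink defined in the next section.
        rows.addSink(new ClickhouseSink(
                "log_table", "default", "",
                new String[]{"10.0.0.1:8123"},
                tableColums, types, tableColums));

        env.execute("kafka-to-clickhouse");
    }

    private static Row parseToRow(String line) {
        String[] parts = line.split("\t", 3);
        return Row.of(parts[0], parts[1], parts[2]);
    }
}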
Flink Clickhouse Sink
import java.io.Serializable;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.types.Row;

// Batching sink: rows are buffered in memory and flushed to ClickHouse over
// JDBC when the buffer reaches insertCkBatchSize rows, or when
// insertCkTimenterval milliseconds have elapsed since the last flush.
public class ClickhouseSink extends RichSinkFunction<Row> implements Serializable {
    private String tablename;
    private String[] tableColums;
    private List<String> types;
    private String[] columns;
    private String username;
    private String password;
    private String[] ips;
    private String drivername = "ru.yandex.clickhouse.ClickHouseDriver";
    private List<Row> list = new ArrayList<>();
    private List<PreparedStatement> preparedStatementList = new ArrayList<>();
    private List<Connection> connectionList = new ArrayList<>();
    private List<Statement> statementList = new ArrayList<>();
    private long lastInsertTime = 0L;
    // Flush buffered rows at least every 4 seconds, even if the batch is not full.
    private long insertCkTimenterval = 4000L;
    // Flush buffered rows once the batch reaches this size.
    private int insertCkBatchSize = 10000;

    public ClickhouseSink(String tablename, String username, String password, String[] ips, String[] tableColums, List<String> types, String[] columns) {
        this.tablename = tablename;
        this.username = username;
        this.password = password;
        this.ips = ips;
        this.tableColums = tableColums;
        this.types = types;
        this.columns = columns; // newly added column list
    }

    // Bind each buffered Row to the PreparedStatement and execute the batch insert.
    public void insertData(List<Row> rows, PreparedStatement preparedStatement, Connection connection) throws SQLException {
        for (int i = 0; i < rows.size(); ++i) {
            Row row = rows.get(i);
            for (int j = 0; j < this.tableColums.length; ++j) {
                if (null != row.getField(j)) {
                    preparedStatement.setObject(j + 1, row.getField(j));