目录
需求:flink分批读clickhouse数据,处理后再存clickhouse。当然也可以按需读存其他地方。
流式数据采用“缓存后批量插入”的方式写入,以提高写入性能;注意任务结束时需要刷出最后一批未满的数据。
1.maven依赖
<dependency>
<groupId>com.clickhouse</groupId>
<artifactId>clickhouse-jdbc</artifactId>
<version>0.3.2-patch5</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.14.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.14.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>1.14.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_2.11</artifactId>
<version>1.14.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>1.14.6</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.11</artifactId>
<version>1.14.6</version>
</dependency>
2.main
import com.alibaba.fastjson.JSONObject;
import com.sumec.flink.entity.RuleDto;
import com.sumec.flink.entity.UserProductTagDto;
import com.sumec.flink.entity.UserRule;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.shaded.guava30.com.google.common.collect.Lists;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
public class FlinkClickhouseDemo {
public static void main(String[] args) {
// 创建Flink执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.of(10, TimeUnit.SECONDS)));
// 数据源,读clickhouse,并行度按需设置
DataStream<Map<String, String>> users = env.addSource(new ClickhouseSource()).setParallelism(1);
// transformation: map keyBy timeWindow apply
// 此处简单过滤和转换,按需设置并行度
SingleOutputStreamOperator<List<UserProductTagDto>> singleDto = users.filter(new FilterFunction<Map<String, String>>() {
@Override
public boolean filter(Map<String, String> map) throws Exception {
return filterData(map, ruleDto);
}
}).setParallelism(150).map(new MapFunction<Map<String, String>, List<UserProductTagDto>>() {
@Override
public List<UserProductTagDto> map(Map<String, String> stringStringMap) throws Exception {
return parse(stringStringMap, ruleDto.getAnalysisNo());
}
}).setParallelism(150);
// sink,写clickhouse或其他地方,按需设置并行度
userProductList.addSink(new ClickHouseSink()).setParallelism(32);
// execute
try {
env.execute();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
// transform
private static List<UserProductTagDto> parse(Map<String, String> map) throws Exception {
// 按需处理
}
public static boolean filterData(Map<String, String> map, RuleDto ruleDto) {
}
}
3.自定义source
存量数据:按天分批取数据处理
增量数据:每隔一段时间拉取一次数据
import com.clickhouse.client.internal.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import ru.yandex.clickhouse.ClickHouseConnection;
import ru.yandex.clickhouse.ClickHouseDataSource;
import ru.yandex.clickhouse.ClickHouseStatement;
import ru.yandex.clickhouse.settings.ClickHouseProperties;
import ru.yandex.clickhouse.settings.ClickHouseQueryParam;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
@Slf4j
public class ClickhouseSource extends RichParallelSourceFunction<Map<String, String>> {
private boolean flag = true;
private ClickHouseConnection conn = null;
private ClickHouseStatement stmt = null;
private ResultSet rs = null;
private Map<ClickHouseQueryParam, String> additionalDBParams = new HashMap<>();
// open只执行一次,适合开启资源
@Override
public void open(Configuration parameters) throws Exception {
String url = "jdbc:clickhouse://xxxxx:8123";
ClickHouseProperties properties = new ClickHouseProperties();
properties.setSessionId(UUID.randomUUID().toString());
// properties.setUser("default");
// properties.setPassword("");
properties.setDatabase("xxx");
ClickHouseDataSource dataSource = new ClickHouseDataSource(url, properties);
// ClickHouseProperties
additionalDBParams.put(ClickHouseQueryParam.SESSION_ID, UUID.randomUUID().toString());
conn = dataSource.getConnection();
stmt = conn.createStatement();
}
@Override
public void run(SourceContext<Map<String, String>> ctx) throws Exception {
String sql = "select * from xxx where event_time between '%s 00:00:00' and '%s 23:59:59' limit %s,%s";
String start = "2024-01-01";
String end = "2024-01-01";
int limitStart = 0;
int limit = 50000;
while (flag) {
String newsql = String.format(sql, start, end, limitStart, limit);
rs = stmt.executeQuery(newsql, additionalDBParams);
int c = 0;
while (rs.next()) {
c++;
Map<String, String> hm = Maps.newHashMap();
ResultSetMetaData rsmd = rs.getMetaData();
int count = rsmd.getColumnCount();// 获取列的数量
for (int i = 1; i <= count; i++) {
String key = rsmd.getColumnLabel(i);
String value = rs.getString(i);
hm.put(key, value);
}
ctx.collect(hm);
}
limitStart += c;
if (c < limit) {
// 查完了,换时间
limitStart = 0;
LocalDate localDate = LocalDate.parse(start, DateTimeFormatter.ofPattern("yyyy-MM-dd")).plusDays(1);
start = localDate.toString();
end = localDate.toString();
if (localDate.isAfter(LocalDate.now())) {
break;
}
}
}
}
// 接收到cancel命令时取消数据生成
@Override
public void cancel() {
flag = false;
}
@Override
public void close() throws Exception {
if (conn != null)
conn.close();
if (stmt != null)
stmt.close();
if (rs != null)
rs.close();
}
}
4.自定义sink
注意:
1.流式处理要求实时性,可即时插入,数据量大时会有性能问题,可以转存kafka
2.对于历史数据处理任务,流式数据做批量插入,可缓存后批量插入,最后一批数据未达到batchSize时会丢失,需要在close方法中再做一次处理。
import com.sumec.flink.entity.UserProductTagDto;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import ru.yandex.clickhouse.ClickHouseConnection;
import ru.yandex.clickhouse.ClickHouseDataSource;
import ru.yandex.clickhouse.ClickHouseStatement;
import ru.yandex.clickhouse.settings.ClickHouseProperties;
import ru.yandex.clickhouse.settings.ClickHouseQueryParam;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
@Slf4j
public class ClickHouseSink extends RichSinkFunction<List<UserProductTagDto>> {
private ClickHouseConnection conn = null;
private PreparedStatement stmt = null;
private ResultSet rs = null;
private Map<ClickHouseQueryParam, String> additionalDBParams = new HashMap<>();
private final int batchSize = 10000;
private String sql = "insert into";
// open只执行一次,适合开启资源
@Override
public void open(Configuration parameters) throws Exception {
String url = "jdbc:clickhouse://xxxxxx:8123";
ClickHouseProperties properties = new ClickHouseProperties();
properties.setSessionId(UUID.randomUUID().toString());
// properties.setUser("default");
// properties.setPassword("");
properties.setDatabase("xxxx");
ClickHouseDataSource dataSource = new ClickHouseDataSource(url, properties);
// ClickHouseProperties
additionalDBParams.put(ClickHouseQueryParam.SESSION_ID, UUID.randomUUID().toString());
conn = dataSource.getConnection();
stmt = conn.createStatement(sql);
}
@Override
public void invoke(List<UserProductTagDto> value, Context context) throws Exception {
if (CollectionUtils.isEmpty(value)) {
return;
}
for (UserProductTagDto dto : value) {
preparedStatement.setString(1, dto.getUserNo());
preparedStatement.setString(2, dto.getAnalysisNo());
preparedStatement.setString(3, dto.getProductTagNo());
preparedStatement.setInt(4, dto.getActionType());
preparedStatement.setString(5, dto.getEventTime());
preparedStatement.setString(6, DateFormatUtils.format(new Date(), "yyyy-MM-dd HH:mm:ss"));
preparedStatement.addBatch();
count++;
}
// 流式数据缓存后,批量插入clickhouse,能大大提高性能
if (count >= batchSize) {
preparedStatement.executeBatch();
count = 0;
}
// 按需,也可以即时处理
// preparedStatement.executeBatch();
}
@Override
public void close() throws Exception {
if (count != 0) {
// 缓存批量插入,注意插入最后一批数据,防止未达到batchSize丢失
preparedStatement.executeBatch();
}
if (conn != null)
conn.close();
if (stmt != null)
stmt.close();
if (rs != null)
rs.close();
}
}