-- Set puballtables to true for any existing publication
update pg_publication set puballtables=true where pubname is not null;
-- Create a publication covering all tables
CREATE PUBLICATION dbz_publication FOR ALL TABLES;
-- Check which tables have been published
select * from pg_publication_tables;
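If the publication was created successfully, each published table appears as one row in pg_publication_tables; a narrower check against the publication created above could be:

select pubname, schemaname, tablename from pg_publication_tables where pubname='dbz_publication';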
4. Change the table's replica identity so UPDATE and DELETE carry the old values
-- Change the replica identity so UPDATE and DELETE events include the previous row values
ALTER TABLE xxxxxx REPLICA IDENTITY FULL;
-- Check the replica identity (a value of 'f' means the change succeeded)
select relreplident from pg_class where relname='xxxxxx';
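For reference, relreplident is a single-character code: d = default (use the primary key), n = nothing, i = use a specific index, f = full (log all columns). A quick verification query (xxxxxx is the placeholder table name) should therefore return f:

select relname, relreplident from pg_class where relname='xxxxxx';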
II. Reading PG Data with Flink
1. Add the dependencies
<dependency>
    <groupId>com.ververica</groupId>
    <artifactId>flink-connector-postgres-cdc</artifactId>
    <version>2.2.0</version>
</dependency>
<dependency>
    <groupId>org.postgresql</groupId>
    <artifactId>postgresql</artifactId>
    <version>42.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_${scala.version}</artifactId>
    <version>${flink.version}</version>
    <exclusions>
        <exclusion>
            <artifactId>kafka-clients</artifactId>
            <groupId>org.apache.kafka</groupId>
        </exclusion>
    </exclusions>
</dependency>
Note: if flink-connector-kafka is also on the classpath, the kafka-clients it pulls in may conflict with the one used by the CDC connector; exclude it manually as shown above.
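If it is unclear which dependency pulls in the conflicting kafka-clients, Maven's standard dependency tree can show it (a general-purpose Maven command, not specific to this setup):

mvn dependency:tree -Dincludes=org.apache.kafka:kafka-clients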
2. Create a PG source with Flink CDC
import com.ververica.cdc.connectors.postgres.PostgreSQLSource;
import com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.Properties;

public static SourceFunction<String> getPGSource(String database, String schemaList, String tableList, String slotName) {
    // These properties are handed straight to Debezium, so they take no "debezium." prefix
    Properties properties = new Properties();
    properties.setProperty("snapshot.mode", "always"); // always: snapshot + streaming; never: streaming only
    properties.setProperty("slot.name", "pg_cdc"); // overridden by .slotName(...) on the builder below
    // Drop the replication slot automatically when the job stops
    properties.setProperty("slot.drop.on.stop", "true");
    properties.setProperty("include.schema.changes", "true");
    // PostgreSQL source
    SourceFunction<String> sourceFunction = PostgreSQLSource.<String>builder()
            .hostname("localhost")
            .port(5432)
            .database(database)     // database to monitor
            .schemaList(schemaList) // schemas to monitor
            .tableList(tableList)   // tables to monitor; regular expressions are supported
            .username("postgres")
            .password("postgres")
            .decodingPluginName("pgoutput")
            // By default each connection takes its own replication slot; multiple
            // connections without an explicit slot name will conflict.
            .slotName(slotName)
            .deserializer(new MyDebezium()) // custom deserializer that fixes PG date/time formats and time zones
            //.deserializer(new JsonDebeziumDeserializationSchema())
            .debeziumProperties(properties)
            .build();
    return sourceFunction;
}
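A minimal sketch of wiring this source into a job might look like the following, assuming the getPGSource method above lives in the same class. The database, schema, table, and slot names are placeholders, and print() stands in for whatever sink (e.g. Kafka) you actually use:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Checkpointing lets the CDC source record its WAL position for recovery
    env.enableCheckpointing(10_000L);
    env.setParallelism(1); // the SourceFunction-based Postgres CDC source only runs with parallelism 1

    env.addSource(getPGSource("mydb", "public", "public.my_table", "flink_cdc_slot"))
       .print(); // replace with a Kafka producer or other sink

    env.execute("postgres-cdc-demo");
}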
3. Custom deserializer
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import com.ververica.cdc.debezium.utils.TemporalConversions;
import io.debezium.time.*;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
public class MyDebezium implements DebeziumDeserializationSchema<String> {

    // Time zone used when converting date/time values
    private static String serverTimeZone = "Asia/Shanghai";

    @Override
    public void deserialize(SourceRecord sourceRecord, Collector<String> collector) throws Exception {
        // 1. Create a JSONObject to hold the final, assembled record
        JSONObject result = new JSONObject();
        // 2. Parse the primary key
        Struct key = (Struct) sourceRecord.key();
        JSONObject keyJs = parseStruct(key);
        // 3. Parse the value
        Struct value = (Struct) sourceRecord.value();
        Struct source = value.getStruct("source");
        JSONObject beforeJson = parseStruct(value.getStruct("before"));
        JSONObject afterJson = parseStruct(value.getStruct("after"));
        // Assemble the output record
        result.put("db", source.get("db").toString().toLowerCase());
        //result.put("schema", source.get("schema").toString().toLowerCase()); // schema name; include if you need it
        result.put("table", source.get("table").toString().toLowerCase());
        result.put("key", keyJs);
        result.put("op", value.get("op").toString());
        result.put("op_ts", LocalDateTime.ofInstant(Instant.ofEpochMilli(source.getInt64("ts_ms")), ZoneId.of(serverTimeZone)));
        result.put("current_ts", LocalDateTime.ofInstant(Instant.ofEpochMilli(value.getInt64("ts_ms")), ZoneId.of(serverTimeZone)));
        result.put("before", beforeJson);
        result.put("after", afterJson);
        // Emit the record downstream
        collector.collect(result.toJSONString());
    }

    private JSONObject parseStruct(Struct valueStruct) {
        if (valueStruct == null) return null;
        JSONObject dataJson = new JSONObject();
        for (Field field : valueStruct.schema().fields()) {
            Object v = valueStruct.get(field);
            String type = field.schema().name();
            Object val = null;
            if (v instanceof Long) {
                long vl = (Long) v;
                val = convertLongToTime(vl, type);
            } else if (v instanceof Integer) {
                int iv = (Integer) v;
                val = convertIntToDate(iv, type);
            } else if (v == null) {
                val = null;
            } else {
                val = convertObjToTime(v, type);
            }
            dataJson.put(field.name().toLowerCase(), val);
        }
        return dataJson;
    }

    private Object convertObjToTime(Object obj, String type) {
        Object val = obj;
        if (Time.SCHEMA_NAME.equals(type) || MicroTime.SCHEMA_NAME.equals(type) || NanoTime.SCHEMA_NAME.equals(type)) {
            val = java.sql.Time.valueOf(TemporalConversions.toLocalTime(obj)).toString();
        } else if (Timestamp.SCHEMA_NAME.equals(type) || MicroTimestamp.SCHEMA_NAME.equals(type)
                || NanoTimestamp.SCHEMA_NAME.equals(type) || ZonedTimestamp.SCHEMA_NAME.equals(type)) {
            val = java.sql.Timestamp.valueOf(TemporalConversions.toLocalDateTime(obj, ZoneId.of(serverTimeZone))).toString();
        }
        return val;
    }

    private Object convertIntToDate(int obj, String type) {
        SchemaBuilder dateSchema = SchemaBuilder.int64().name("org.apache.kafka.connect.data.Date");
        Object val = obj;
        if (Date.SCHEMA_NAME.equals(type)) {
            // Debezium encodes DATE as the number of days since the epoch
            val = org.apache.kafka.connect.data.Date.toLogical(dateSchema, obj)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalDate().toString();
        }
        return val;
    }

    private Object convertLongToTime(long obj, String type) {
        SchemaBuilder timeSchema = SchemaBuilder.int64().name("org.apache.kafka.connect.data.Time");
        SchemaBuilder dateSchema = SchemaBuilder.int64().name("org.apache.kafka.connect.data.Date");
        SchemaBuilder timestampSchema = SchemaBuilder.int64().name("org.apache.kafka.connect.data.Timestamp");
        Object val = obj;
        if (Time.SCHEMA_NAME.equals(type)) {
            // milliseconds since midnight
            val = org.apache.kafka.connect.data.Time.toLogical(timeSchema, (int) obj)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalTime().toString();
        } else if (MicroTime.SCHEMA_NAME.equals(type)) {
            // microseconds since midnight
            val = org.apache.kafka.connect.data.Time.toLogical(timeSchema, (int) (obj / 1000))
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalTime().toString();
        } else if (NanoTime.SCHEMA_NAME.equals(type)) {
            // nanoseconds since midnight
            val = org.apache.kafka.connect.data.Time.toLogical(timeSchema, (int) (obj / 1000 / 1000))
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalTime().toString();
        } else if (Timestamp.SCHEMA_NAME.equals(type)) {
            // milliseconds since the epoch
            LocalDateTime t = org.apache.kafka.connect.data.Timestamp.toLogical(timestampSchema, obj)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalDateTime();
            val = java.sql.Timestamp.valueOf(t).toString();
        } else if (MicroTimestamp.SCHEMA_NAME.equals(type)) {
            // microseconds since the epoch
            LocalDateTime t = org.apache.kafka.connect.data.Timestamp.toLogical(timestampSchema, obj / 1000)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalDateTime();
            val = java.sql.Timestamp.valueOf(t).toString();
        } else if (NanoTimestamp.SCHEMA_NAME.equals(type)) {
            // nanoseconds since the epoch
            LocalDateTime t = org.apache.kafka.connect.data.Timestamp.toLogical(timestampSchema, obj / 1000 / 1000)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalDateTime();
            val = java.sql.Timestamp.valueOf(t).toString();
        } else if (Date.SCHEMA_NAME.equals(type)) {
            // days since the epoch
            val = org.apache.kafka.connect.data.Date.toLogical(dateSchema, (int) obj)
                    .toInstant().atZone(ZoneId.of(serverTimeZone)).toLocalDate().toString();
        }
        return val;
    }

    @Override
    public TypeInformation<String> getProducedType() {
        return BasicTypeInfo.STRING_TYPE_INFO;
    }
}
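For reference, an update to a row in a table with primary key id would produce JSON shaped roughly like this (field values are illustrative; op is Debezium's operation code: c = create, u = update, d = delete, r = snapshot read):

{
  "db": "mydb",
  "table": "my_table",
  "key": {"id": 1},
  "op": "u",
  "op_ts": "2024-01-01T12:00:00",
  "current_ts": "2024-01-01T12:00:00.123",
  "before": {"id": 1, "name": "old"},
  "after": {"id": 1, "name": "new"}
}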
Note: this code is adapted from the Scala version at https://huaweicloud.csdn.net/63356ef0d3efff3090b56c01.html