1、用户画像体系分为实时标签和离线标签,具体方案参考了阿里云用户画像标签体系的架构方案并加以修改,采用Lambda架构的方式通过离线标签T-1修正实时标签策略。
2、架构设计
参考阿里云架构体系:
修正的架构体系:
基于MaxCompute+Hologres的人群圈选和数据服务实践-阿里云开发者社区https://developer.aliyun.com/article/792500
3、逻辑实现
由于Hologres版本过低,0.8版本不支持Blink source/sink,因此采用API方式实现。
①pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.alibaba.blink</groupId>
<artifactId>blink-udx-3.x</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<scala.version>2.11.12</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<blink.version>blink-3.3.0</blink.version>
<java.version>1.8</java.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<sdk.version>0.38.3-public</sdk.version>
</properties>
<dependencies>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-core</artifactId>
<version>${blink.version}</version>
<!-- <scope>provided</scope>-->
<!-- <systemPath>${project.basedir}/lib/flink-core-blink-3.2.2.jar</systemPath>-->
</dependency>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${blink.version}</version>
<!-- <scope>provided</scope>-->
<!-- <systemPath>${project.basedir}/lib/flink-streaming-java_2.11-blink-3.2.2.jar</systemPath>-->
</dependency>
<!--打包udf自定义函数需要添加此依赖-->
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-table_${scala.binary.version}</artifactId>
<version>${blink.version}</version>
<!-- <scope>provided</scope>-->
<!-- <systemPath>${project.basedir}/lib/flink-table_2.11-blink-3.2.2.jar</systemPath>-->
</dependency>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${blink.version}</version>
<!-- <scope>provided</scope>-->
<!-- <systemPath>${project.basedir}/lib/flink-table_2.11-blink-3.2.2.jar</systemPath>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba.blink/flink-streaming-scala -->
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
<version>${blink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-java</artifactId>
<version>${blink.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-scala_${scala.binary.version}</artifactId>
<version>${blink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba.blink/flink-jdbc -->
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-jdbc</artifactId>
<version>${blink.version}</version>
</dependency>
<!-- NOTE: removed a duplicate declaration of flink-streaming-java_${scala.binary.version};
     it is already declared above and Maven warns on duplicate dependency declarations. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.12</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.scala-lang/scala-reflect -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-reflect</artifactId>
<version>2.11.12</version>
<scope>provided</scope>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.aliyun.datahub</groupId>-->
<!-- <artifactId>aliyun-sdk-datahub</artifactId>-->
<!-- <version>2.12.2-public</version>-->
<!-- </dependency>-->
<dependency>
<groupId>com.aliyun.datahub</groupId>
<artifactId>aliyun-sdk-datahub</artifactId>
<version>2.12.2-public</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jul-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<!--<exclusion>-->
<!--<artifactId>jackson-databind</artifactId>-->
<!--<groupId>com.fasterxml.jackson.core</groupId>-->
<!--</exclusion>-->
<!--<exclusion>-->
<!--<artifactId>jackson-annotations</artifactId>-->
<!--<groupId>com.fasterxml.jackson.core</groupId>-->
<!--</exclusion>-->
</exclusions>
</dependency>
<dependency>
<groupId>com.aliyun.emr</groupId>
<artifactId>emr-datahub_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<!-- 打包datahubsink-datahubsource需要这个依赖-->
<dependency>
<groupId>com.alibaba.flink</groupId>
<artifactId>datahub-connector</artifactId>
<version>0.1-SNAPSHOT</version>
<classifier>jar-with-dependencies</classifier>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.1.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.76</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.alibaba.blink</groupId>
<artifactId>flink-connector-kafka-0.10_2.11</artifactId>
<version>${blink.version}</version>
<scope>provided</scope>
</dependency>
<!--采用Redis做维表缓存-->
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-core</artifactId>
<version>3.5.2</version>
</dependency>
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-redis-client</artifactId>
<version>3.5.2.CR3</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.46</version>
</dependency>
<!--odpssink需要这个依赖-->
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-sdk-core</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-sdk-udf</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-udf-local</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-sdk-mapred</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-mapred-local</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-sdk-graph</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-graph-local</artifactId>
<version>${sdk.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<!--JDBC ODPS依赖-->
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-jdbc</artifactId>
<version>3.0.1</version>
<classifier>jar-with-dependencies</classifier>
</dependency>
<!--java工具类-->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>4.6.3</version>
</dependency>
</dependencies>
<!-- <build>-->
<!-- <finalName>${project.artifactId}</finalName>-->
<!-- <outputDirectory>target/classes</outputDirectory>-->
<!-- <testOutputDirectory>target/test-classes</testOutputDirectory>-->
<!-- <plugins>-->
<!-- <!– This plugin compiles Scala files –>-->
<!-- <plugin>-->
<!-- <groupId>net.alchim31.maven</groupId>-->
<!-- <artifactId>scala-maven-plugin</artifactId>-->
<!-- <version>3.2.2</version>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <id>scala-compile-first</id>-->
<!-- <phase>process-resources</phase>-->
<!-- <goals>-->
<!-- <goal>add-source</goal>-->
<!-- <goal>compile</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- <execution>-->
<!-- <id>scala-test-compile</id>-->
<!-- <phase>process-test-resources</phase>-->
<!-- <goals>-->
<!-- <goal>testCompile</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- </plugin>-->
<!-- <!– This plugin compiles Java files –>-->
<!-- <plugin>-->
<!-- <groupId>org.apache.maven.plugins</groupId>-->
<!-- <artifactId>maven-compiler-plugin</artifactId>-->
<!-- <version>3.5.1</version>-->
<!-- <configuration>-->
<!-- <source>1.8</source>-->
<!-- <target>1.8</target>-->
<!-- </configuration>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <phase>compile</phase>-->
<!-- <goals>-->
<!-- <goal>compile</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- </plugin>-->
<!-- </plugins>-->
<!-- </build>-->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.1.1</version>
<configuration>
<archive>
<manifest>
<mainClass>com.alibaba.blink.demo.ods.Ods_Stream_Demo</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
datahub source 需要手动加载maven仓库到本地,参考:读取DataHub数据示例 - 实时计算Flink版 - 阿里云 https://help.aliyun.com/document_detail/156813.html
② 入口类
package com.alibaba.blink.demo.ods;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.sink.holo.HoloSink;
import com.alibaba.blink.sink.holo.HoloSink1;
import com.alibaba.blink.sink.ods.DatahubSink;
import com.alibaba.blink.sink.ods.OdpsSink;
import com.alibaba.blink.source.ods.Event_info;
import com.alibaba.flink.connectors.datahub.datastream.source.DatahubSourceFunction;
import com.aliyun.datahub.client.model.RecordEntry;
import com.aliyun.odps.jdbc.utils.OdpsLogger;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.io.Serializable;
import java.util.List;
/**
 * ODS streaming entry point: consumes batches of RecordEntry from a DataHub
 * topic, flattens them into Record objects, filters incomplete records, and
 * fans the stream out to Hologres, MaxCompute (ODPS) and DataHub sinks.
 *
 * Fixes: removed an unused OdpsLogger local and a dead commented-out
 * configuration block (the settings now live in Event_info).
 */
public class Ods_Stream_Demo implements Serializable {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Source side runs with parallelism 1; sinks raise their own parallelism below.
        env.setParallelism(1);
        // Connection parameters are centralized in Event_info (loaded from config.properties).
        DataStreamSource<List<RecordEntry>> listDataStreamSource = env.addSource(
                new DatahubSourceFunction(
                        Event_info.endPoint
                        , Event_info.projectName
                        , Event_info.topicSourceName
                        , Event_info.accessId
                        , Event_info.accessKey
                        , Event_info.datahubStartInMs
                        , Event_info.datahubEndInMs
                        // NOTE(review): 20L / 1000L / 1000 are presumably the connector's
                        // polling/batching parameters — confirm against the DataHub connector docs.
                        , 20L
                        , 1000L
                        , 1000
                ));
        DataStream<Record> result = listDataStreamSource
                // Each source element is a batch of RecordEntry; flatten to one Record each.
                .flatMap((FlatMapFunction<List<RecordEntry>, Record>) (ls, collector) -> {
                    for (RecordEntry recordEntry : ls) {
                        collector.collect(new Record(recordEntry));
                    }
                }).returns(Record.class)
                // Drop records missing the fields the downstream sinks require.
                .filter(s -> s.getOneid() != null)
                .filter(s -> s.getEvent() != null);
        result.print();
        result.addSink(new HoloSink()).setParallelism(10);
        result.addSink(new OdpsSink()).setParallelism(10);
        result.addSink(new HoloSink1()).setParallelism(10);
        result.addSink(new DatahubSink());
        env.execute();
    }
}
③Event_info
package com.alibaba.blink.source.ods;
import com.alibaba.blink.utils.ConfigPropUtils;
import com.alibaba.blink.utils.TimeToStampUtil;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
* @author yangyingchun
* @version 1.0
* @date 2021/12/14 15:55
*/
public class Event_info {
// SimpleDateFormat is not thread-safe; safe here only because it is used during
// static initialization and in main(). NOTE(review): prefer java.time.DateTimeFormatter.
static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
// DataHub connection settings, loaded from config.properties on the classpath.
public static String endPoint = ConfigPropUtils.get("my_datahub_endpoint");
//public static String endPoint ="public endpoint";// public-network access (when the intranet endpoint is filled in, the public endpoint is not needed).
public static String projectName = ConfigPropUtils.get("ods.datahub_projectname");
public static String topicSourceName = ConfigPropUtils.get("ods.datahub_source_topic.event_info");
public static String topicSinkName = ConfigPropUtils.get("datahub_sink_topic");
public static String accessId = ConfigPropUtils.get("my_accessId");
public static String accessKey = ConfigPropUtils.get("my_accessKey");
// Consumption start offset: midnight of the current day, via TimeToStampUtil.timeToStamp("yyyy-MM-dd").
// The original note says this time must be at least the current time.
public static Long datahubStartInMs = TimeToStampUtil.timeToStamp(formatter.format(new Date()));
// Consume with no upper bound.
public static Long datahubEndInMs=Long.MAX_VALUE;
// Manual smoke test: prints today's date in yyyy-MM-dd form.
public static void main(String[] args) {
Date currentTime = new Date();
String dateString = formatter.format(currentTime);
System.out.println(dateString);
}
}
④resource/config.properties
## DataHub endpoint and credentials (values redacted in this document).
my_datahub_endpoint=https://datahub.cn-********.cn
datahub_projectname=itsl
datahub_source_topic=event_info
datahub_source_topic1=event_info1
datahub_sink_topic=datahub_sink
my_accessId=3231321************
my_accessKey=32132131321***********
## Hologres JDBC (PostgreSQL protocol) connection.
my_holo_url=jdbc:postgresql://holo-cn-bzdcbpeb21b7-********:80/ods?tcpKeepAlive=true
postgresdriver=org.postgresql.Driver
## MaxCompute (ODPS) endpoint and JDBC settings.
my_odps_endpoint=http://service.cn-**********.cn/api
my_odps_project=ODS
my_odps_driver=com.aliyun.odps.jdbc.OdpsDriver
my_odps_url=jdbc:odps:http://service.cn-**********.cn/api?project=ODS&charset=UTF-8&interactiveMode=true
## Kafka / ZooKeeper (local defaults).
kafka.topic=topic_name
bootstrap.servers=localhost:9092
zookeeper.connect=localhost:2181
group.id001=customer-001
## Data-warehouse layer settings
###############################ODS###############################
ods.datahub_projectname=ODS
ods.datahub_source_topic.event_info=event_info
###############################ODS###############################
###############################DWD###############################
dwd.datahub_projectname=ODS
dwd.datahub_source_topic.datahubsink=datahub_sink
###############################DWD###############################
## Test-related settings
testfile=E:\\software\\workspace\\blink_udx_3x-master\\src\\main\\resources\\testfile
⑤工具类
package com.alibaba.blink.utils;
import java.io.InputStream;
import java.util.Properties;
/**
 * Loads "config.properties" from the classpath once (per classloader) and
 * exposes read-only access to its values.
 *
 * Fixes: the InputStream was never closed (resource leak) and a missing
 * resource caused props.load(null) to throw NPE; both handled now.
 */
public class ConfigPropUtils {
    private static Properties props;
    static {
        props = new Properties();
        // try-with-resources guarantees the stream is closed even on load failure.
        try (InputStream in = ConfigPropUtils.class.getClassLoader().getResourceAsStream("config.properties")) {
            if (in != null) {
                props.load(in);
            } else {
                // Missing file: keep an empty Properties so get() degrades to returning null.
                System.err.println("config.properties not found on classpath");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Returns the value for the given key, or null when the key (or the whole
     * config file) is absent.
     */
    public static String get(String key) {
        return props.getProperty(key, null);
    }
}
⑥自定义Sink
a) HoloSink 更新Oneid
package com.alibaba.blink.sink.holo;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.utils.ConfigPropUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.UUID;
/**
 * Hologres sink: upserts (oneid, event) into the holo_sink table via JDBC.
 * If a row with the record's oneid exists, its event is updated; otherwise a
 * new row is inserted.
 *
 * NOTE(review): the insert branch stores a freshly generated UUID instead of
 * record.getOneid(), so the same incoming oneid will never match on later
 * lookups (repeated inserts). The original comment says "assign a new oneid"
 * — confirm this is intended.
 *
 * Fix: the ResultSet returned by the lookup query was leaked on every invoke;
 * it is now closed via try-with-resources.
 */
public class HoloSink extends RichSinkFunction<Record> {
    private static String url = ConfigPropUtils.get("my_holo_url");
    private static String username = ConfigPropUtils.get("my_accessId");
    private static String password = ConfigPropUtils.get("my_accessKey");
    private static String postgresdriver = ConfigPropUtils.get("postgresdriver");
    private Connection connection;
    // Statements are created lazily, one of each kind per calling thread.
    private ThreadLocal<PreparedStatement> pstmt;
    private ThreadLocal<PreparedStatement> querymt;
    private ThreadLocal<PreparedStatement> updatemt;

    private Connection getConnection() {
        Connection conn = null;
        try {
            Class.forName(postgresdriver);
            conn = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return conn;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        this.connection = getConnection();
        this.pstmt = new ThreadLocal<>();
        this.querymt = new ThreadLocal<>();
        this.updatemt = new ThreadLocal<>();
    }

    @Override
    public void invoke(Record record, Context context) throws Exception {
        if (null == record || record.getOneid() == null) {
            System.out.println("record is null!!!");
            return;
        }
        // Prepare each statement lazily on first use by this thread.
        if (this.querymt.get() == null)
            this.querymt.set(this.connection.prepareStatement("select oneid,event from holo_sink where oneid=?"));
        if (this.updatemt.get() == null)
            this.updatemt.set(this.connection.prepareStatement("update holo_sink set event=? where oneid=?"));
        if (this.pstmt.get() == null)
            this.pstmt.set(this.connection.prepareStatement("insert into holo_sink(oneid,event) values (?,?)"));
        this.querymt.get().setString(1, record.getOneid());
        // try-with-resources closes the ResultSet (previously leaked each call).
        try (ResultSet resultSet = this.querymt.get().executeQuery()) {
            if (resultSet.next()) {
                // oneid exists: update its event.
                this.updatemt.get().setString(1, record.getEvent());
                this.updatemt.get().setString(2, record.getOneid());
                this.updatemt.get().executeUpdate();
                System.out.println("update " + record.toString() + ",threadId:" + Thread.currentThread().getId());
            } else {
                // oneid absent: insert with a newly generated oneid (see class note).
                this.pstmt.get().setString(1, UUID.randomUUID().toString());
                this.pstmt.get().setString(2, record.getEvent());
                this.pstmt.get().executeUpdate();
                System.out.println("insert " + record.toString() + ",threadId:" + Thread.currentThread().getId());
            }
        }
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (this.pstmt.get() != null)
            this.pstmt.get().close();
        if (this.querymt.get() != null)
            this.querymt.get().close();
        if (this.updatemt.get() != null)
            this.updatemt.get().close();
        if (this.connection != null)
            this.connection.close();
    }
}
b)OdpsSink 沉淀离线数据
package com.alibaba.blink.sink.ods;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.utils.ConfigPropUtils;
import com.alibaba.blink.utils.ConnOdpsUtil;
import com.aliyun.odps.Odps;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.UUID;
/**
* @author yangyingchun
* @version 1.0
* @date 2021/12/20 8:45
*/
/**
 * MaxCompute (ODPS) sink: appends each record to the odps_sink table via JDBC.
 * The third column receives a freshly generated UUID per row.
 *
 * Fixes: the connection and statement were mutable static fields, so parallel
 * sink subtasks in the same JVM clobbered each other's state; they are now
 * per-instance. The SQL is prepared once in open() instead of per invoke, and
 * close() releases the JDBC resources (previously never closed).
 */
public class OdpsSink extends RichSinkFunction<Record> {
    private static final String INSERT_SQL = "insert into table odps_sink values (?,?,?)";
    // transient: JDBC objects are not serializable; recreated in open() per subtask.
    private transient Connection odpsConn;
    private transient PreparedStatement preparedStatement;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        odpsConn = ConnOdpsUtil.getOdpsConn();
        preparedStatement = odpsConn.prepareStatement(INSERT_SQL);
    }

    @Override
    public void invoke(Record record, Context context) throws Exception {
        preparedStatement.setString(1, record.getOneid());
        preparedStatement.setString(2, record.getEvent());
        preparedStatement.setString(3, UUID.randomUUID().toString());
        int i = preparedStatement.executeUpdate();
        System.out.println("successfully updated " + i + " records,ThreadID:" + Thread.currentThread().getId());
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (preparedStatement != null) preparedStatement.close();
        if (odpsConn != null) odpsConn.close();
    }
}
c) DataHub Sink 实时数仓体系
package com.alibaba.blink.sink.ods;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.utils.ConfigPropUtils;
import com.aliyun.datahub.client.DatahubClient;
import com.aliyun.datahub.client.DatahubClientBuilder;
import com.aliyun.datahub.client.auth.AliyunAccount;
import com.aliyun.datahub.client.common.DatahubConfig;
import com.aliyun.datahub.client.http.HttpConfig;
import com.aliyun.datahub.client.model.PutRecordsResult;
import com.aliyun.datahub.client.model.RecordEntry;
import com.aliyun.datahub.client.model.RecordSchema;
import com.aliyun.datahub.client.model.TupleRecordData;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.util.ArrayList;
import java.util.List;
/**
* @author yangyingchun
* @version 1.0
* @date 2021/12/14 13:25
*/
/**
 * DataHub sink: writes each record as a tuple (oneid, message, if_exists) to
 * the configured sink topic.
 *
 * Fix: retryTimes was declared but never used; records that DataHub reports as
 * failed in the PutRecordsResult are now resent up to retryTimes times.
 */
public class DatahubSink extends RichSinkFunction<Record> {
    // Endpoint/project/topic/credentials come from config.properties.
    static String endpoint = ConfigPropUtils.get("my_datahub_endpoint");
    static String projectName = ConfigPropUtils.get("datahub_projectname");
    static String topicSinkName = ConfigPropUtils.get("datahub_sink_topic");
    static String accessId = ConfigPropUtils.get("my_accessId");
    static String accessKey = ConfigPropUtils.get("my_accessKey");
    static DatahubClient datahubClient;
    static RecordSchema recordSchema;
    // Maximum resend attempts for records DataHub reports as failed.
    static int retryTimes = 10;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        datahubClient = DatahubClientBuilder.newBuilder()
                .setDatahubConfig(
                        // Binary transfer is supported by server 2.12+; on some private
                        // clouds, set this flag to false if connecting fails.
                        new DatahubConfig(endpoint,
                                new AliyunAccount(accessId, accessKey), true))
                // HttpConfig is optional; LZ4 compression is recommended for reads/writes.
                .setHttpConfig(new HttpConfig()
                        .setCompressType(HttpConfig.CompressType.LZ4)
                        .setConnTimeout(10000))
                .build();
        // Fetch the topic schema once so tuple records can be built in invoke().
        recordSchema = datahubClient.getTopic(projectName, topicSinkName).getRecordSchema();
    }

    @Override
    public void invoke(Record record, Context context) throws Exception {
        List<RecordEntry> recordEntries = new ArrayList<>();
        RecordEntry recordEntry = new RecordEntry();
        // Optional per-record attributes (e.g. host/ip); writing succeeds without them.
        recordEntry.addAttribute("key2", "value2");
        TupleRecordData data = new TupleRecordData(recordSchema);
        data.setField("oneid", record.getOneid());
        data.setField("message", record.getEvent());
        data.setField("if_exists", String.valueOf(Math.random()));
        recordEntry.setRecordData(data);
        recordEntries.add(recordEntry);
        // Resend any records the service reports as failed, up to retryTimes attempts.
        PutRecordsResult result = datahubClient.putRecords(projectName, topicSinkName, recordEntries);
        for (int attempt = 0; attempt < retryTimes && result.getFailedRecordCount() > 0; attempt++) {
            result = datahubClient.putRecords(projectName, topicSinkName, result.getFailedRecords());
        }
    }
}
d)实时标签
package com.alibaba.blink.demo.holo;
import com.alibaba.blink.mojo.GetResultSet;
import com.alibaba.blink.sink.holo.Holo_Write_Sink;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.utils.ConfigPropUtils;
import com.alibaba.flink.connectors.datahub.datastream.source.DatahubSourceFunction;
import com.aliyun.datahub.client.model.RecordEntry;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
* @author yangyingchun
* @version 1.0
* @date 2021/12/13 14:46
*/
// Real-time tag job: consumes events from DataHub, assigns/validates oneid
// against an existing set loaded from Hologres, and writes tag rows via
// Holo_Write_Sink.
public class Stream_Holo_Tags {
private static String endPoint = ConfigPropUtils.get("my_datahub_endpoint");
//private static String endPoint ="public endpoint";// public-network access (when the intranet endpoint is filled in, the public endpoint is not needed).
private static String projectName = ConfigPropUtils.get("datahub_projectname");
private static String topicSourceName = ConfigPropUtils.get("datahub_source_topic1");
private static String accessId = ConfigPropUtils.get("my_accessId");
private static String accessKey = ConfigPropUtils.get("my_accessKey");
private static Long datahubStartInMs = 0L;// start offset time for consumption (0 = from the beginning).
private static Long datahubEndInMs=Long.MAX_VALUE;
// Known oneids, loaded once per subtask in the RichMapFunction's open().
// NOTE(review): this is a static field shared by all subtasks in one JVM and a
// List (O(n) contains) — a per-instance HashSet would be safer/faster; confirm.
private static List<String> oneids = new ArrayList<>();
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(8);
DataStreamSource<List<RecordEntry>> listDataStreamSource = env.addSource(
new DatahubSourceFunction(endPoint,
projectName,
topicSourceName,
accessId,
accessKey,
datahubStartInMs,
datahubEndInMs,
20L,
1000L,
1000))
;
// Flatten each batch of RecordEntry into individual Record objects.
SingleOutputStreamOperator<Record> result = listDataStreamSource.flatMap(new FlatMapFunction<List<RecordEntry>, Record>() {
@Override
public void flatMap(List<RecordEntry> recordEntries, Collector<Record> collector) throws Exception {
for (RecordEntry recordEntry : recordEntries) {
Record record = new Record(recordEntry);
collector.collect(record);
}
}
})
// Data loaded in the map function's open() can be cached in memory.
.map(new RichMapFunction<Record, Record>() {
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
oneids = GetResultSet.getResultSet();
}
@Override
public Record map(Record record) throws Exception {
// If the oneid is already known, keep the record as-is.
if (oneids.contains(record.getOneid())){
return record;
// Otherwise generate a new oneid for the event.
}else {
return new Record(UUID.randomUUID().toString(),record.getEvent());
}
}
})
// Drop null records and records without a oneid before sinking.
.filter(record ->
null!=record &&
record.getOneid() != null);
result.addSink(new Holo_Write_Sink()).setParallelism(8);
env.execute("join demo");
}
}
e)HoloWriteSink
package com.alibaba.blink.sink.holo;
import com.alibaba.blink.mojo.Record;
import com.alibaba.blink.mojo.TAGS_ID;
import com.alibaba.blink.mojo.TAGS_VALUES;
import com.alibaba.blink.utils.ConfigPropUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Timestamp;
import java.util.Date;
import java.util.Map;
/**
* @author yangyingchun
* @version 1.0
* @date 2021/12/13 14:48
*/
/**
 * Hologres sink for real-time tags: inserts one row into stream_tags
 * (oneid, phone, tag_id, tag_value, create_time) per incoming record.
 *
 * Fixes: open() called super.open(parameters) twice; close() was missing, so
 * the JDBC statement and connection were never released (HoloSink closes its
 * resources — this sink now matches that convention).
 */
public class Holo_Write_Sink extends RichSinkFunction<Record> {
    private static String url = ConfigPropUtils.get("my_holo_url");
    private static String username = ConfigPropUtils.get("my_accessId");
    private static String password = ConfigPropUtils.get("my_accessKey");
    private static String postgresdriver = ConfigPropUtils.get("postgresdriver");
    private Connection connection;
    private ThreadLocal<PreparedStatement> pstmt;

    private Connection getConnection() {
        Connection conn = null;
        try {
            Class.forName(postgresdriver);
            conn = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return conn;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters); // was called twice in the original; once is enough
        this.connection = getConnection();
        this.pstmt = new ThreadLocal<>();
    }

    @Override
    public void invoke(Record record, Context context) throws Exception {
        if (null == record || record.getOneid() == null) {
            System.out.println("record is null!!!");
            return;
        }
        // Prepare the insert statement lazily on first use by this thread.
        if (this.pstmt.get() == null)
            this.pstmt.set(this.connection.prepareStatement(
                    "insert into " +
                    "stream_tags" +
                    "(oneid,phone,tag_id,tag_value,create_time) " +
                    "values (?,?,?,?,?)"
                    )
            );
        this.pstmt.get().setString(1, record.getOneid());
        // NOTE(review): phone is filled with a random placeholder value — confirm intended.
        this.pstmt.get().setString(2, "phone" + String.valueOf(Math.random() * 1000 + 1));
        this.pstmt.get().setString(3, String.valueOf(TAGS_ID.Key200001));
        this.pstmt.get().setString(4, TAGS_VALUES.get_value(TAGS_ID.Key200001, record.getEvent()));
        this.pstmt.get().setTimestamp(5, new Timestamp(System.currentTimeMillis()));
        int i = this.pstmt.get().executeUpdate();
        if (i > 0) {
            System.out.println("!!!更新标签成功!!!" + Thread.currentThread().getId());
        }
    }

    @Override
    public void close() throws Exception {
        super.close();
        // Added: release JDBC resources (the original never closed them).
        if (this.pstmt != null && this.pstmt.get() != null)
            this.pstmt.get().close();
        if (this.connection != null)
            this.connection.close();
    }
}