1、各组件版本
Blink 3.6.0
Datahub
Hologres 0.10
Rds 8.0/5.7
Maxcompute 3.5
2、维表关联代码
2.1 Maxcompute维表
--Create the MaxCompute (ODPS) dimension table.
create table idmapping_dim(
oneid VARCHAR ,
phone VARCHAR,
-- idcard VARCHAR,
-- vin VARCHAR,
-- name VARCHAR,
-- plate_num VARCHAR,
-- -- createtime TIMESTAMP ,
-- -- updatetime TIMESTAMP,
-- weight bigint,
-- ssoid VARCHAR,
-- uid VARCHAR,
PRIMARY KEY (oneid) ,
PERIOD FOR SYSTEM_TIME --Marks this as a dimension table, joined with FOR SYSTEM_TIME AS OF.
) with (
type = 'odps',
endpoint = 'http://service.cn-.odps.ops.cloud.gwm.cn/api',
-- tunnelEndpoint = '',
project = 'itsl_dev',
tablename = 'idmapping_dim',
accessid = '<yourAccessId>',  --SECURITY: real credentials were committed here; masked — rotate the originals.
accesskey = '<yourAccessKey>',
cache = 'ALL' --ODPS dimension tables only support cache = 'ALL'.
-- cache='LRU',
-- cacheSize = '100000'
-- maxRowCount = '100000' --default value
);
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://datahub.cn-.dh.res.cloud.gwm.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = '<yourAccessId>',
accessKey = '<yourAccessKey>',
startTime='2022-03-01 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the result (print) table for debugging.
CREATE TABLE print_table (
oneid varchar,
message varchar,
if_exists varchar
) WITH (
type = 'print',
logger = 'true'
);
--Processing-time lookup join: events with no mapping get a fresh uuid()
--as oneid and if_exists = '0'; matched events keep the mapped oneid ('1').
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
2.2 Rds维表
--Create the RDS (MySQL) dimension table.
create table idmapping_dim (
oneid VARCHAR ,
phone VARCHAR,
idcard VARCHAR,
vin VARCHAR,
name VARCHAR,
plate_num VARCHAR,
weight INT,
ssoid VARCHAR,
uid VARCHAR,
data_date VARCHAR,
PRIMARY KEY (oneid),
PERIOD FOR SYSTEM_TIME--Declares the change period of the dimension table, i.e. its contents change over time.
)
with (
type='rds',
url='jdbc:mysql://XXXX.**:3306/bigdata_sale',
host = '10.252.48.**',
port = '3306',
dbname = 'bigdata_sale',
tablename = 'idmapping_dim',
username = 'XXXX',
password='XXX',
-- cache = 'ALL'
cache = 'LRU',  --RDS dim tables support LRU cache; size below.
cacheSize = '100000'
);
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://XXXXXXXX.cn-dh.res.cloud.gwm.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXXXXXX',
accessKey = 'XXXXXXXXX',
startTime='2021-12-16 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the result (print) table for debugging.
CREATE TABLE print_table (
oneid varchar,
message varchar,
if_exists varchar
) WITH (
type = 'print',
logger = 'true'
);
--Processing-time lookup join; uuid() generates a fresh oneid when no mapping exists.
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
2.3 Hologres维表
--Create the Hologres dimension table.
CREATE TABLE idmapping_dim(
oneid VARCHAR ,
phone VARCHAR ,
PRIMARY KEY (oneid),
PERIOD FOR SYSTEM_TIME --Declares the change period of the dimension table.
) with (
type = 'hologres',
dbname = 'itsl_dev',
tablename = 'idmapping_dim',
username = 'XXX',
password = 'XXXXX',
endpoint = 'XXXXXXXcn-.hologres.ops.cloud.gwm.cn:80',
cache = 'LRU',
cacheSize = '10000'
-- async = 'true' --whether to read data asynchronously
);
-- "Could not find a suitable table factory" error: https://help.aliyun.com/document_detail/194527.html Blink exclusive clusters below 3.5 are not compatible with Hologres; upgrade to version 3.7.
-- 'connector'='hologres',
-- 'dbname'='<yourDbname>', --Hologres database name.
-- 'tablename'='<yourTablename>', --Hologres table that receives the data.
-- 'username'='<yourUsername>', --AccessKey ID of the current Alibaba Cloud account.
-- 'password'='<yourPassword>', --AccessKey Secret of the current Alibaba Cloud account.
-- 'endpoint'='<yourEndpoint>' --VPC endpoint of the current Hologres instance.
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://datahub.cn-res.cloud.gwm.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXXX',
accessKey = 'XXXXXXXXXXXXX',
startTime='2022-04-01 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the result (print) table for debugging.
CREATE TABLE print_table (
oneid varchar,
message varchar,
if_exists varchar
) WITH (
type = 'print',
logger = 'true'
);
--Processing-time lookup join; uuid() generates a fresh oneid when no mapping exists.
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
3、维表关联
--Processing-time lookup join: unmatched events get a fresh uuid() as oneid
--and if_exists = '0'; matched events keep the mapped oneid and get '1'.
INSERT INTO print_table
SELECT
    CASE WHEN dim.oneid IS NULL THEN uuid() ELSE dim.oneid END AS oneid,
    src.event AS `message`,
    CASE WHEN dim.oneid IS NULL THEN '0' ELSE '1' END AS if_exists
FROM datahub_source AS src
LEFT JOIN idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME() AS dim
    ON dim.oneid = src.oneid
;
说明
维表支持INNER JOIN和LEFT JOIN,不支持RIGHT JOIN或FULL JOIN。
必须加上FOR SYSTEM_TIME AS OF PROCTIME(),表示JOIN维表当前时刻所看到的每条数据。
源表后面进来的数据只会关联当时维表的最新信息,即JOIN行为只发生在处理时间(Processing Time)。如果JOIN行为发生后,维表中的数据发生了变化(新增、更新或删除),则已关联的维表数据不会被同步变化。
ON条件中必须包含维表所有的PRIMARY KEY的等值条件(且要求与真实表定义一致)。此外,ON条件中也可以有其他等值条件。
如果您有一对多JOIN需求,请在维表DDL INDEX中指定关联的KEY,详情请参见INDEX语法。
维表和维表不能进行JOIN。
ON条件中维表字段不能使用CAST等类型转换函数。如果您有类型转换需求,请在源表字段进行操作。
4、数据去向
4.1 MaxcomputeSink
① MaxcomputeSink
--SQL
--********************************************************************--
--Author: admin_sale
--CreateTime: 2021-09-09 13:56:58
--Comment: business description goes here
--********************************************************************--
--Create the RDS (MySQL) dimension table.
create table idmapping_dim (
oneid VARCHAR ,
phone VARCHAR,
idcard VARCHAR,
vin VARCHAR,
name VARCHAR,
plate_num VARCHAR,
weight INT,
ssoid VARCHAR,
uid VARCHAR,
data_date VARCHAR,
PRIMARY KEY (oneid),
PERIOD FOR SYSTEM_TIME--Declares the change period of the dimension table, i.e. its contents change over time.
)
with (
type='rds',
url='jdbc:mysql://10.252.48.**:3306/bigdata_sale',
host = '10.252.48.**',
port = '3306',
dbname = 'bigdata_sale',
tablename = 'idmapping_dim',
username = 'XXX',
password='XXXXXXXX',
-- cache = 'ALL'
cache = 'LRU',
cacheSize = '100000'
);
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://XXXXX.XXXXXXXXXXX-dh.res.cloud.XX.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXXXXXXX',
accessKey = 'XXXXXXXXXXX',
startTime='2021-12-16 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the MaxCompute (ODPS) sink table.
create table print_table(
oneid varchar,
message varchar,
if_exists varchar
) with (
type = 'odps',
endpoint = 'http://service.cn-odps.ops.cloud.gwm.cn/api',
tunnelEndpoint = '',
project = 'itsl_dev',
tablename = 'odps_sink',
accessid = '<yourAccessId>',  --SECURITY: real credentials were committed here; masked — rotate the originals.
accesskey = '<yourAccessKey>'
);
--Processing-time lookup join; uuid() generates a fresh oneid when no mapping exists.
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
4.2 HologresSink
②HologresSink
--SQL
--********************************************************************--
--Author: admin_sale
--CreateTime: 2021-09-09 13:56:58
--Comment: business description goes here
--********************************************************************--
--Create the RDS (MySQL) dimension table.
create table idmapping_dim (
oneid VARCHAR ,
phone VARCHAR,
idcard VARCHAR,
vin VARCHAR,
name VARCHAR,
plate_num VARCHAR,
weight INT,
ssoid VARCHAR,
uid VARCHAR,
data_date VARCHAR,
PRIMARY KEY (oneid),
PERIOD FOR SYSTEM_TIME--Declares the change period of the dimension table, i.e. its contents change over time.
)
with (
type='rds',
url='jdbc:mysql://10.252.48.**:3306/bigdata_sale',
host = '10.252.48.**',
port = '3306',
dbname = 'bigdata_sale',
tablename = 'idmapping_dim',
username = 'XXXXXX',
password='XXXXX',
-- cache = 'ALL'
cache = 'LRU',
cacheSize = '100000'
);
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://datahubXXXXXXcn-dh.res.cloud.XXXXX.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXX',
accessKey = 'XXXXXX',
startTime='2021-12-16 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the Hologres sink table. Only works on Blink 3.6.0 and above; for older Blink versions see the compatibility note in section 2.3.
create table print_table(
oneid VARCHAR,
message VARCHAR,
if_exists VARCHAR
) with (
type = 'hologres',
dbname = 'itsl_dev',
tablename = 'hologres_sink',
username = 'XXXXXX',
password = 'XXXXXX',
endpoint = 'XXXXXXXX.hologres.ops.cloud.XXX.cn:80'
);
--Processing-time lookup join; uuid() generates a fresh oneid when no mapping exists.
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
4.3 DatahubSink
③ DatahubSink
--SQL
--********************************************************************--
--Author: admin_sale
--CreateTime: 2021-09-09 13:56:58
--Comment: business description goes here
--********************************************************************--
--Create the RDS (MySQL) dimension table.
create table idmapping_dim (
oneid VARCHAR ,
phone VARCHAR,
idcard VARCHAR,
vin VARCHAR,
name VARCHAR,
plate_num VARCHAR,
weight INT,
ssoid VARCHAR,
uid VARCHAR,
data_date VARCHAR,
PRIMARY KEY (oneid),
PERIOD FOR SYSTEM_TIME--Declares the change period of the dimension table, i.e. its contents change over time.
)
with (
type='rds',
url='jdbc:mysql://10.252.48.**:3306/bigdata_sale',
host = '10.252.48.**',
port = '3306',
dbname = 'bigdata_sale',
tablename = 'idmapping_dim',
username = '<yourUsername>',  --SECURITY: real credentials were committed here; masked — rotate the originals.
password='<yourPassword>',
-- cache = 'ALL'
cache = 'LRU',
cacheSize = '100000'
);
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://XXXXXX.cn-dh.res.cloud.XXXXXXXXX.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXXXXXXXXX',
accessKey = 'XXXXXXXXXXXXX',
startTime='2021-12-16 12:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Create the DataHub sink table.
create table print_table (
oneid VARCHAR,
message VARCHAR,
if_exists VARCHAR
)with(
type='datahub',
endPoint = 'https://XX.cn-dh.res.cloud.XX.cn',
project = 'itsl',
topic='datahub_sink',
accessId = 'XXXXXX',
accessKey = 'XXXXXXXXXXXX',
batchSize='300',
batchWriteTimeoutMs='1000',
hashFields='oneid' --Rows with the same value in this column are written to the same shard.
);
--Alternative print sink for debugging.
-- CREATE TABLE print_table1 (
-- oneid varchar,
-- message varchar,
-- if_exists varchar
-- ) WITH (
-- type = 'print',
-- logger = 'true'
-- );
--Processing-time lookup join; uuid() generates a fresh oneid when no mapping exists.
INSERT INTO print_table
SELECT
if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
,da.event as `message`
,if(idmapping_dim.oneid is null,'0','1') if_exists
from datahub_source da
left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
on idmapping_dim.oneid=da.oneid
;
-- INSERT INTO print_table1
-- SELECT
-- if(idmapping_dim.oneid is null,uuid(),idmapping_dim.oneid) as oneid
-- ,da.event as `message`
-- ,if(idmapping_dim.oneid is null,'0','1') if_exists
-- from datahub_source da
-- left join idmapping_dim FOR SYSTEM_TIME AS OF PROCTIME()
-- on idmapping_dim.oneid=da.oneid
-- ;
5、自定义函数
package com.alibaba.blink.udx;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.DataTypes;
import org.apache.flink.types.Row;
import java.math.BigInteger;
/**
 * Blink UDTF that explodes a JSON array field into one row per element,
 * emitting (event, eventType, properties, startTime).
 *
 * @author yangyingchun
 * @version 1.0
 * @date 2021/12/16 18:14
 */
public class ParseJson extends TableFunction<Tuple4<String, String, String, Long>> {

    /**
     * Parses {@code jsons}, looks up the array stored under {@code key}
     * (e.g. "behaviorDatas") and collects one tuple per array element.
     *
     * @param jsons raw JSON string from the source column; null emits nothing
     * @param key   name of the JSON array field to explode
     */
    public void eval(String jsons, String key) {
        if (jsons == null) {
            return;
        }
        JSONObject root = JSON.parseObject(jsons);
        // getJSONArray is null-safe: a missing/non-array key yields null.
        JSONArray items = root.getJSONArray(key);
        if (items == null) {
            return; // key absent: emit no rows instead of throwing NPE
        }
        for (Object item : items) {
            JSONObject json = (JSONObject) item;
            String event = json.getString("event");
            String eventType = json.getString("eventType");
            String properties = JSON.toJSONString(json.get("properties"));
            // BUG FIX: fastjson may parse a small number as Integer, so the
            // original "(long) json.get(...)" cast could throw
            // ClassCastException; getLong handles Integer/Long/String alike.
            Long startTime = json.getLong("startTime");
            collect(new Tuple4<String, String, String, Long>(
                    event, eventType, properties, startTime
            ));
        }
    }

    // Required override: without it Blink infers a single-column result and
    // SQL validation fails with "List of column aliases must have same
    // degree as table; table has 1 columns ('f0') ...".
    @Override
    public DataType getResultType(Object[] arguments, Class[] argTypes) {
        return DataTypes.createTupleType(
                DataTypes.STRING, DataTypes.STRING, DataTypes.STRING, DataTypes.LONG);
    }
}
--SQL
--********************************************************************--
--Author: admin_sale
--CreateTime: 2021-12-17 09:28:00
--Comment: business description goes here
--********************************************************************--
--Register the Java UDTF defined above.
CREATE FUNCTION ParseJS AS 'com.alibaba.blink.udx.ParseJson';
--Create the DataHub message source.
create table datahub_source(
oneid VARCHAR,
event VARCHAR
) with (
type = 'datahub',
endPoint = 'https://XXXXXXX.cn-dh.res.cloud.XXXXXXX.cn',
project = 'itsl',
topic = 'event_info',
subId = '',  --fill in the subscription id of the topic
accessId = 'XXXXX',
accessKey = 'XXXXXXXXXXXXX',
startTime='2021-12-17 00:00:00',--adjust to the actual start offset
lengthCheck='NONE'
);
--Equivalent intermediate view (kept for reference):
-- create view v1 as
-- select event1,eventType,properties,startTime
-- from datahub_source,
-- lateral table(ParseJS(event,'behaviorDatas')) as T (event1,eventType,properties,startTime)
-- ;
-- Create the result (print) table.
CREATE TABLE print_table (
oneid varchar,
event1 varchar,
eventType varchar,
_td_current_province varchar,
_td_current_city varchar,
properties varchar,
startTime BIGINT
) WITH (
type = 'print',
logger = 'true'
);
--Explode the 'behaviorDatas' array of each event via the UDTF (one output
--row per array element) and pull two fields out of the properties JSON.
INSERT INTO print_table
SELECT
oneid
,event1
,eventType
,JSON_VALUE(T.properties,'$._td_current_province') as _td_current_province
,JSON_VALUE(T.properties,'$._td_current_city') AS _td_current_city
,properties
,startTime
from
datahub_source as S,
lateral table(ParseJS(event,'behaviorDatas')) as T(event1,eventType,properties,startTime)
;
6、DataStreamAPI
BLINK-DataStream开发_大数据00的博客-CSDN博客
--Full main class name; required, e.g. com.alibaba.realtimecompute.DatastreamExample
blink.main.class=com.alibaba.blink.demo.stream_demo
--Resource name of the JAR containing the main class; required when there are multiple JARs, e.g. blink_datastream.jar
--blink.main.jar=${resource name of the JAR containing the main class}
--Default state backend configuration; effective when the job code does not declare one explicitly
state.backend.type=niagara
state.backend.niagara.ttl.ms=129600000
--Default checkpoint configuration; effective when the job code does not declare one explicitly
blink.checkpoint.interval.ms=180000
--Enable project-level parameters by default
--disable.project.config=false
--Custom parameters; for how to read them in code see:
--https://help.aliyun.com/document_detail/127758.html?spm=a2c4g.11174283.6.677.61fb1e49NJoWTR
-- Enable window miniBatch on version 3.2+ (disabled by default on 3.2+).
-- sql.exec.mini-batch.window.enabled=true
-- -- exactly-once semantics.
-- blink.checkpoint.mode=EXACTLY_ONCE
-- -- Checkpoint interval in milliseconds.
-- blink.checkpoint.interval.ms=180000
-- blink.checkpoint.timeout.ms=600000
-- -- Realtime Compute (Flink) 2.0+: use niagara as the state backend and set the state TTL in milliseconds.
-- state.backend.type=niagara
-- state.backend.niagara.ttl.ms=129600000
-- -- Realtime Compute (Flink) 2.0+: enable a 5-second microbatch (not needed for window functions).
-- blink.microBatch.allowLatencyMs=5000
-- -- Maximum latency allowed for the whole job.
-- blink.miniBatch.allowLatencyMs=5000
-- -- Optimization parameter for dual-stream join nodes.
-- blink.miniBatch.join.enabled=true
-- -- Size of a single batch.
-- blink.miniBatch.size=20000
-- -- Local aggregation; enabled by default on Flink 2.0+, must be enabled manually on 1.6.4.
-- blink.localAgg.enabled=true
-- -- Flink 2.0+: enable partial aggregation to fix slow COUNT DISTINCT.
-- blink.partialAgg.enabled=true
-- -- UNION ALL optimization.
-- blink.forbid.unionall.as.breakpoint.in.subsection.optimization=true
-- -- GC tuning (must not be set when the source is SLS).
-- blink.job.option=-yD heartbeat.timeout=180000 -yD env.java.opts='-verbose:gc -XX:NewRatio=3 -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:ParallelGCThreads=4'
-- -- Time zone setting.
-- blink.job.timeZone=Asia/Shanghai