Kafka source
CREATE TABLE `kafka_log`.`kafka_topic_log` (
`common` ROW<ar STRING,ba STRING,ch STRING,is_new STRING,md STRING,mid STRING,os STRING,uid STRING,vc STRING>,
`page` ROW<during_time STRING,item STRING,item_type STRING,last_page_id STRING,page_id STRING,source_type STRING> ,
`actions` ARRAY<ROW<action_id STRING,item STRING,item_type STRING,ts BIGINT>>,
`displays` ARRAY<ROW<display_type STRING,item STRING,item_type STRING,`order` STRING,pos_id STRING>>,
`start` ROW<entry STRING,loading_time BIGINT,open_ad_id BIGINT,open_ad_ms BIGINT,open_ad_skip_ms BIGINT>,
`err` ROW<error_code BIGINT,msg STRING>,
`ts` BIGINT
) WITH (
'connector' = 'kafka',
'topic' = 'topic_log',
'properties.bootstrap.servers' = 'hadoop1:9092',
'properties.group.id' = 'hudi_source',
'scan.startup.mode' = 'earliest-offset',
'format' = 'json',
--whether to fail if a field is missing from the JSON
'json.fail-on-missing-field'='false',
--whether to ignore errors when parsing the JSON
'json.ignore-parse-errors' = 'true'
);
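To sanity-check the source, the nested ROW fields can be queried directly. A minimal sketch (the column picks are arbitrary examples from the schema above):
SELECT common.mid, page.page_id, ts
FROM kafka_log.kafka_topic_log;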
Hudi sink + Hive sync
CREATE TABLE `hudi_ods`.`ods_log` (
`uuid` STRING,
`common` ROW<ar STRING,ba STRING,ch STRING,is_new STRING,md STRING,mid STRING,os STRING,uid STRING,vc STRING>,
`page` ROW<during_time STRING,item STRING,item_type STRING,last_page_id STRING,page_id STRING,source_type STRING> ,
`actions` ARRAY<ROW<action_id STRING,item STRING,item_type STRING,ts BIGINT>>,
`displays` ARRAY<ROW<display_type STRING,item STRING,item_type STRING,`order` STRING,pos_id STRING>>,
`start` ROW<entry STRING,loading_time BIGINT,open_ad_id BIGINT,open_ad_ms BIGINT,open_ad_skip_ms BIGINT>,
`err` ROW<error_code BIGINT,msg STRING>,
`ts` BIGINT,
`dt` STRING,
`t` AS TO_TIMESTAMP(FROM_UNIXTIME(ts/1000,'yyyy-MM-dd HH:mm:ss')),
WATERMARK FOR `t` AS `t` - INTERVAL '5' SECOND
)
PARTITIONED BY (`dt`)
WITH (
'connector'='hudi',
'path' ='hdfs://hadoop1:8020/user/hudi/warehouse/hudi_ods/ods_log',
'table.type'='MERGE_ON_READ',
--the log table has no primary key field, so uuid serves as the unique record key; it can be filled with the built-in uuid() function on insert
'hoodie.datasource.write.recordkey.field' = 'uuid',
--precombine field: picks the newest record when keys collide; with uuid keys it has little effect
'hoodie.datasource.write.precombine.field' = 'ts',
--bucket-assign parallelism; changing this value changes the number of files written out
'write.bucket_assign.tasks'='1',
--write parallelism: set to 1 here to reduce resource usage
'write.tasks' = '1',
--compaction parallelism
'compaction.tasks' = '1',
--run compaction asynchronously
'compaction.async.enabled' = 'true',
--let this job schedule compaction plans
'compaction.schedule.enabled' = 'true',
--trigger strategy: by number of commits
'compaction.trigger.strategy' = 'num_commits',
--compact every 5 delta commits
'compaction.delta_commits' = '5',
--enable streaming read
'read.streaming.enabled' = 'true',
--skip compaction commits when streaming, to avoid consuming the same records twice
'read.streaming.skip_compaction' = 'true',
--enable Hive sync
'hive_sync.enable'='true',
'hive_sync.mode' = 'hms',
'hive_sync.metastore.uris' = 'thrift://hadoop1:9083',
'hive_sync.db'='hive_ods',
'hive_sync.table'='ods_log'
);
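The job feeding this table has to supply the uuid record key and the dt partition value itself. A minimal sketch of that INSERT, assuming dt is the event date derived from ts (the 'yyyy-MM-dd' format is an assumption; the computed column t is derived automatically, so it is not listed):
INSERT INTO hudi_ods.ods_log
SELECT
  UUID(),  --the built-in function mentioned above, fills the record key
  common, page, actions, displays, `start`, err, ts,
  FROM_UNIXTIME(ts/1000,'yyyy-MM-dd')  --partition value dt
FROM kafka_log.kafka_topic_log;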
When inserting: if a NULL value is written to a column that Flink treats as NOT NULL, the job fails by default (ERROR). To drop such rows instead, set:
set table.exec.sink.not-null-enforcer=drop;
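After Hive sync runs, a MERGE_ON_READ table shows up in Hive as two tables: hive_ods.ods_log_ro (read-optimized, compacted base files only) and hive_ods.ods_log_rt (real-time, merges delta logs on read). A sketch of querying the real-time view from Hive, with a placeholder dt value (Hive also needs the hudi-hadoop-mr bundle on its classpath):
SELECT common.mid, ts
FROM hive_ods.ods_log_rt
WHERE dt = '2022-06-14'
LIMIT 10;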