Components: StreamX, Flink 1.12, Hive 2.x
I. Searching the Flink documentation for information on writing to Hive
Apache Flink 1.12 Documentation: Hive
Steps to locate the information:
1. First find the section on connecting Flink to external systems.
2. Since we are using the Flink SQL approach, look under Table & SQL.
3. Find Hive.
4. Start with the overview.
The "Overview" page contains the information for connecting to Hive:
1. First, register the catalog. A catalog holds metadata. Apache Flink 1.12 Documentation: Catalogs
2. It shows how to create a catalog with SQL:
CREATE CATALOG myhive WITH (
'type' = 'hive',
'default-database' = 'mydatabase',
'hive-conf-dir' = '/opt/hive-conf'
);
-- set the HiveCatalog as the current catalog of the session
USE CATALOG myhive;
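Once the catalog is registered, it can be sanity-checked from the same SQL session with standard Flink SQL commands (`mydatabase` here is the database named in the catalog definition above):

```sql
-- verify the catalog is visible to the session
SHOW CATALOGS;        -- should list default_catalog and myhive
USE CATALOG myhive;
SHOW DATABASES;       -- databases in the Hive metastore
USE mydatabase;
SHOW TABLES;          -- existing Hive tables are now queryable from Flink
```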
3. Creating a table in Hive
Apache Flink 1.12 Documentation: Hive Read & Write
SET table.sql-dialect=hive;
CREATE TABLE hive_table (
user_id STRING,
order_amount DOUBLE
) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (
'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',
'sink.partition-commit.trigger'='partition-time',
'sink.partition-commit.delay'='1 h',
'sink.partition-commit.policy.kind'='metastore,success-file'
);
SET table.sql-dialect=default;
CREATE TABLE kafka_table (
user_id STRING,
order_amount DOUBLE,
log_ts TIMESTAMP(3),
WATERMARK FOR log_ts AS log_ts - INTERVAL '5' SECOND
) WITH (...);
-- streaming sql, insert into hive table
INSERT INTO TABLE hive_table
SELECT user_id, order_amount, DATE_FORMAT(log_ts, 'yyyy-MM-dd'), DATE_FORMAT(log_ts, 'HH')
FROM kafka_table;
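In this example, `'sink.partition-commit.trigger'='partition-time'` means a partition is committed only once the watermark passes the partition's end time plus the configured delay. If the source cannot provide a watermark, the documentation also offers a processing-time based trigger; a hedged variant of the same sink table (hypothetical table name, only the commit properties change) might look like:

```sql
SET table.sql-dialect=hive;
-- variant: commit partitions on processing time instead of watermarks;
-- simpler, but late-arriving events may land in already-committed partitions
CREATE TABLE hive_table_proctime (
  user_id STRING,
  order_amount DOUBLE
) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (
  'sink.partition-commit.trigger'='process-time',
  'sink.partition-commit.delay'='0s',
  'sink.partition-commit.policy.kind'='metastore,success-file'
);
```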
For an explanation of the configuration options, see:
Apache Flink 1.12 Documentation: FileSystem SQL Connector
II. Hands-on
1. Create the table in Hive.
CREATE EXTERNAL TABLE `stg_tdts_station_vehicle_sign_filter_1`(
`msg_id` string,
`vehicle_sign_id` string,
`station_id` string,
`lane_num` string,
`vehicle_id` string,
`vehicle_id_color` bigint,
`tail_vehicle_id` string,
`veh_speed` bigint,
`identify_type` bigint,
`veh_feature_code` string,
`face_feature_code` string,
`op_time` string,
`direction` bigint,
`identify_licence_code` string,
`identify_licence_color` bigint)
PARTITIONED BY (
`dt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://bigdata-1:8020/apps/hive/warehouse/stg_tdts_station_vehicle_sign_filter_1'
TBLPROPERTIES (
'partition.time-extractor.timestamp-pattern'='$dt 00:00:00',
'sink.partition-commit.delay'='0s',
'sink.partition-commit.policy.kind'='metastore,success-file',
'sink.partition-commit.trigger'='partition-time')
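The `sink.partition-commit.*` and `partition.time-extractor.*` properties are read by the Flink sink, not by Hive itself, so they are safe to carry on the Hive table. They can be inspected or adjusted later without recreating the table, using standard Hive DDL:

```sql
-- check which sink properties the table carries (run in Hive)
SHOW TBLPROPERTIES stg_tdts_station_vehicle_sign_filter_1;

-- properties can also be added or changed on an existing table
ALTER TABLE stg_tdts_station_vehicle_sign_filter_1
SET TBLPROPERTIES ('sink.partition-commit.delay'='0s');
```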
2. Write data from StreamX into Hive.
-- Processing logic: read toll-exit data from Kafka and write it to the exit ETC table and the "other" table
-- step1: ...
-- step2: ...
-- step3: ...
-- step4: ...
CREATE CATALOG myhive WITH (
'type' = 'hive',
'default-database' = 'dw_shaoxing',
'hive-conf-dir' = '/usr/hdp/2.6.5.0-292/hive/conf/'
);
-- ** source: read data from Kafka
CREATE TABLE if not exists kafka_input_trc_viu_topic(
vehSpeed STRING COMMENT '',
producerTime STRING COMMENT '',
opTime STRING COMMENT '',
faceFeatureCode STRING COMMENT '',
msgId STRING COMMENT '',
id STRING COMMENT '',
vehFeatureCode STRING COMMENT '',
vehicleId STRING COMMENT '',
laneNum STRING COMMENT '',
direction STRING COMMENT '',
`proctime` as PROCTIME() -- computed column that generates a processing-time attribute
)
WITH (
'connector' = 'kafka',
'topic' = 'TRC_VIU_TOPIC_4',
'properties.bootstrap.servers' = 'ip:8092,ip:8092',
'properties.group.id' = 'kafka_input_trc_viu_topic_feng_3',
'scan.startup.mode' = 'earliest-offset',
'format' = 'json'
);
-- sink: two Hive tables, ETC and other
-- ** sink
-- 2. the table must already exist in Hive (created in step 1 above)
-- keep the default dialect here: the INSERT below uses Flink built-ins such as split_index
SET table.sql-dialect=default;
-- write into the raw Hive entry table
-- the dt partition needs to be set based on pass_time
-- toll-station plate-recognition table
INSERT INTO `myhive`.dw_shaoxing.stg_tdts_station_vehicle_sign_filter_1
select
msgId as msg_id,
id as vehicle_sign_id,
substring(id, 1, 14) as station_id,
laneNum as lane_num,
split_index(vehicleId , '_', 0) as vehicle_id, -- split vehicleId on '_'
cast(split_index(vehicleId , '_', 1) as bigint) as vehicle_id_color,
-- this field does not exist in the topic
'' as tail_vehicle_id,
cast(vehSpeed as bigint) as veh_speed,
-- there is no identifyType field
--cast(if(identifyType is null, null, identifyType) as bigint) as identify_type,
-1 as identify_type,
'' as veh_feature_code,
'' as face_feature_code,
-- split opTime and rejoin the date and time parts
concat_ws(' ', substring(opTime, 1, 10), substring(opTime, 12, 8)) as op_time,
cast(direction as bigint) as direction,
'' as identify_licence_code,
-1 as identify_licence_color,
substring(opTime, 1, 10) as dt
from kafka_input_trc_viu_topic
;
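After the job has been running for a while, the result can be spot-checked from Hive; partitions committed by the Flink sink should appear without running `msck repair table` (the `dt` value below is only an example):

```sql
-- run in Hive: partitions registered by the metastore commit policy
SHOW PARTITIONS stg_tdts_station_vehicle_sign_filter_1;

-- spot-check one day's data (example partition value)
SELECT msg_id, vehicle_id, op_time
FROM stg_tdts_station_vehicle_sign_filter_1
WHERE dt = '2021-08-01'
LIMIT 10;
```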
3. When sinking to Hive, adding a watermark on the source side removes the need to run `msck repair table` to repair partitions. With `'sink.partition-commit.trigger'='partition-time'`, a partition is committed to the metastore only once the watermark passes the partition time plus the configured delay; a source with no watermark therefore never triggers the commit, and partitions have to be registered by hand. For example, on a source whose event time is a string column (`state_time` here is from another job):
`ts` as TO_TIMESTAMP(state_time, 'yyyyMMddHHmmss'),
WATERMARK FOR `ts` AS `ts`
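Applied to the job above, the Kafka source could declare an event-time column derived from `opTime`. This is a sketch, not the job's actual DDL: it assumes `opTime` looks like `2021-08-01T12:00:00` (matching the `substring` calls in the INSERT) and that a 5-second out-of-orderness bound is acceptable:

```sql
CREATE TABLE if not exists kafka_input_trc_viu_topic (
  vehSpeed STRING,
  opTime STRING,
  msgId STRING,
  id STRING,
  vehicleId STRING,
  laneNum STRING,
  direction STRING,
  -- event-time column rebuilt from opTime; the format is an assumption
  `ts` AS TO_TIMESTAMP(concat_ws(' ', substring(opTime, 1, 10), substring(opTime, 12, 8))),
  -- the watermark lets 'partition-time' commit the dt partitions automatically
  WATERMARK FOR `ts` AS `ts` - INTERVAL '5' SECOND
)
WITH (
  'connector' = 'kafka',
  'topic' = 'TRC_VIU_TOPIC_4',
  'properties.bootstrap.servers' = 'ip:8092,ip:8092',
  'properties.group.id' = 'kafka_input_trc_viu_topic_feng_3',
  'scan.startup.mode' = 'earliest-offset',
  'format' = 'json'
);
```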