Flink 离线批处理 Mysql to Hive 离线数仓

aaaak_

已于 2022-10-08 08:52:24 修改

阅读量1.5k

点赞数

分类专栏：大数据文章标签： hive flink mysql

于 2022-09-30 17:07:47 首次发布

本文链接：https://blog.csdn.net/u010020726/article/details/127123926

版权

大数据专栏收录该内容

14 篇文章 1 订阅

订阅专栏

flink mysql to hive

我本地用的版本是 flink 1.13.2 scala 2.11 , hive 3.1.2

在这里插入图片描述

环境配置

1. source 端读取 mysql：将 flink-connector-jdbc.jar 以及 MySQL 的 JDBC 驱动 mysql-connector-java.jar 导入 FLINK_HOME/lib
2. sink 端写入 Hive：将 flink-sql-connector-hive 导入 FLINK_HOME/lib
注意如果没有找到自己对应版本的 flink hive 包的话就导入下面的包

# flink hive 连接器
flink-connector-hive_2.11-1.13.6.jar
# hive 的依赖
hive-exec-3.1.0.jar
#  Libfb303  在某些版本中没有打包到hive-exec中，需要单独添加它
libfb303-0.9.3.jar 
# 如果 你 需要 在 flink 里面用hive 语法sql ,就需要添加此依赖 SET table.sql-dialect=hive;
antlr-runtime-3.5.2.jar

配置 flink catalog hive

下面举例说明 flink sql 模式下的操作

./bin/sql-client.sh
## 默认只有一个 基于内存的 catalog;
show catalogs; 
## 添加一个 hive 的 catalogs; 需要指定 hive 的一个默认库,和hive 的配置文件地址 在hive-env.sh里面的HIVE_CONF_DIR
-- Register a Hive catalog named hive_catalog.
-- 'default-database' is the database selected when the catalog becomes current;
-- 'hive-conf-dir' is the directory holding the Hive configuration files.
CREATE CATALOG hive_catalog
WITH (
    'type'             = 'hive',
    'default-database' = 'default',
    'hive-conf-dir'    = '/srv/udp/2.0.0.0/hive/conf/'
);
# 再次查看 catalog 发现多了一个 hive_catalog;
show catalogs; 

+-----------------+
|    catalog name |
+-----------------+
| default_catalog |
|    hive_catalog |
+-----------------+
# 选用 hive_catalog
use catalog hive_catalog;
# 查看 catalog 下有哪些库
show databases;

+----------------+
|  database name |
+----------------+
|    ads_demo_xl |
|        default |
|           demo |
|    dim_demo_xl |
| dim_order_test |
|    dwd_demo_xl |
| dwd_order_test |
|    dws_demo_xl |
| dws_order_test |
|  flink_catalog |
|    ods_demo_xl |
| ods_order_test |
|           test |
+----------------+

# 先执行 use <数据库名>; 切换到目标库，再查看该库下有哪些表
show tables;

+-----------------+
|      table name |
+-----------------+
| mytest_deptaddr |
| ods_goods_order |
+-----------------+
# 查看 其中一个表里面的数据
select * from hive_catalog.demo.mytest_deptaddr;

创建 flink Source mysql

# lookup.cache 是设置 缓存
# scan.partition 是分区 并行 查找 相当于 where id between 0 and 50;
-- JDBC-backed source table over the MySQL table `test_cdc`.
-- Later statements in this walkthrough reference the table as
-- hive_catalog.flink_catalog.test_cdc, so select that database explicitly
-- before creating it instead of relying on whatever database is current.
USE hive_catalog.flink_catalog;

CREATE TABLE test_cdc (
    id     INT,
    name   STRING,
    gender STRING,
    score  DOUBLE,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://192.168.12.212:3307/big-data?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai',
    'table-name' = 'test_cdc',
    'username' = 'root',
    'password' = 'root',
    -- lookup.cache.*: cache settings used when the table serves as a lookup table.
    'lookup.cache.ttl' = '3600s',
    'lookup.cache.max-rows' = '10000',
    -- scan.partition.*: split the scan on `id` into 3 parallel range queries
    -- (each one is equivalent to `WHERE id BETWEEN <lo> AND <hi>`).
    'scan.partition.column' = 'id',
    'scan.partition.num' = '3',
    'scan.partition.lower-bound' = '0',
    -- Keep the upper bound inside the INT range of `id`. The range is divided
    -- evenly between the partitions, so the previous bound of 9999999999 put
    -- every possible INT id into the first partition and left the other two empty.
    -- NOTE(review): ideally set the bounds to the real MIN(id)/MAX(id) of the table.
    'scan.partition.upper-bound' = '2147483647'
    -- Optional setting left disabled by the author:
--    ,  'sink.parallelism' = '3'
);

# 查询下 表 看是否创建成功
-- Sanity check: read the JDBC table back to confirm the DDL and connection work.
SELECT
    id,
    name,
    gender,
    score
FROM test_cdc;

创建 Flink sink Hive

// Module 是用于 UDF 扩展
-- Load the Hive module, then enable modules in the order hive, core.
LOAD MODULE hive;
USE MODULES hive, core;

# 启用 hive 方言模式 
SET table.sql-dialect=hive;
# 建立兼容表，如果不用 方言模式 建的表 在 hive 客户端是打不开的，只能在flink 里面打开
-- Hive-compatible target table; this statement is meant to run with the Hive
-- dialect enabled. All four columns are plain INT / STRING and the table
-- declares no partition columns.
CREATE TABLE mytest_deptaddr (
    dept_no     INT,
    addr        STRING,
    tel         STRING,
    statis_date STRING
)
TBLPROPERTIES (
    'streaming-source.enable' = 'false',
    'streaming-source.partition.include' = 'all',
    'lookup.join.cache.ttl' = '12 h',
    'sink.partition-commit.policy.kind' = 'metastore,success-file'
);

-- The Hive dialect only accepts 2-part identifiers, so switch back to the
-- default dialect before using the catalog-qualified table name.
SET table.sql-dialect=default;

-- Sanity check: read the newly created Hive table through Flink.
SELECT
    dept_no,
    addr,
    tel,
    statis_date
FROM hive_catalog.flink_catalog.mytest_deptaddr;

读取 mysql 写入 hive

# 覆盖 insert 只能 批模式
-- INSERT OVERWRITE is only available in batch execution mode.
SET execution.runtime-mode=BATCH;
-- Run the copy under the default dialect: the statement uses catalog-qualified
-- (3-part) names and reads a JDBC table, neither of which the Hive dialect
-- supports.
SET table.sql-dialect=default;

-- Replace the contents of the Hive table with the rows read from MySQL.
-- Columns are matched by position; statis_date is filled with a constant.
INSERT OVERWRITE hive_catalog.flink_catalog.mytest_deptaddr
SELECT
    id     AS dept_no,
    name   AS addr,
    gender AS tel,
    '测试' AS statis_date
FROM hive_catalog.flink_catalog.test_cdc;