一、Datax 数据导出
### --- DataX数据导出
~~~ 官方配置地址:https://github.com/alibaba/DataX/blob/master/introduction.md
二、DataX从Hive导出表到MySQL
### --- 在mysql下建表,准备导出数据的数据库
~~~ ADS有4张表需要从数据仓库的ADS层导入MySQL,即:Hive => MySQL
~~~ 此实验以ads.ads_member_active_count从hdfs导出到mysql为准,
~~~ 其它表中的数据参考以上案例完成。
ads.ads_member_active_count
ads.ads_member_retention_count
ads.ads_member_retention_rate
ads.ads_new_member_cnt
### --- 建表语法:MySQL 建表
~~~ # 活跃会员数
-- Target database for all ADS-layer exports; IF NOT EXISTS keeps reruns idempotent
-- (the DROP TABLE IF EXISTS lines below show the script is meant to be rerunnable).
create database if not exists dwads;
-- 活跃会员数 (active member counts), loaded daily from Hive ads.ads_member_active_count
drop table if exists dwads.ads_member_active_count;
create table dwads.ads_member_active_count(
`dt` varchar(10) COMMENT '统计日期',
`day_count` int COMMENT '当日会员数量',
`week_count` int COMMENT '当周会员数量',
`month_count` int COMMENT '当月会员数量',
primary key (dt)
) COMMENT '活跃会员数';
~~~ # 新增会员数
-- 新增会员数 (new member counts), loaded daily from Hive ads.ads_new_member_cnt;
-- column and table COMMENTs added for consistency with the other ADS tables.
drop table if exists dwads.ads_new_member_cnt;
create table dwads.ads_new_member_cnt
(
`dt` varchar(10) COMMENT '统计日期',
`cnt` int COMMENT '新增会员数',
primary key (dt)
) COMMENT '新增会员数';
~~~ # 会员留存数
-- 会员留存数 (member retention counts), loaded daily from Hive ads.ads_member_retention_count.
-- Each stat date carries one row per retention horizon (1-day, 2-day, ...), so the
-- primary key must include retention_day; PK on dt alone would allow only a single
-- retention row per day (and DataX writeMode=replace would silently overwrite the rest).
drop table if exists dwads.ads_member_retention_count;
create table dwads.ads_member_retention_count
(
`dt` varchar(10) COMMENT '统计日期',
`add_date` varchar(10) comment '新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数',
primary key (dt, retention_day)
) COMMENT '会员留存情况';
~~~ # 会员留存率
-- 会员留存率 (member retention rate), loaded daily from Hive ads.ads_member_retention_rate.
-- Same key structure as ads_member_retention_count: one row per (stat date,
-- retention horizon), so retention_day must be part of the primary key.
drop table if exists dwads.ads_member_retention_rate;
create table dwads.ads_member_retention_rate
(
`dt` varchar(10) COMMENT '统计日期',
`add_date` varchar(10) comment '新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数',
`new_mid_count` bigint comment '当日会员新增数',
`retention_ratio` decimal(10,2) comment '留存率',
primary key (dt, retention_day)
) COMMENT '会员留存率';
三、创建json文件,进行数据导入:hdfsreader => mysqlwriter
~~~ 创建json文件,进行数据导入:hdfsreader => mysqlwriter
~~~ 导出活跃会员数(ads_member_active_count)
export_member_active_count.json
### --- 准备资源数据
~~~ # hive库中ads.ads_member_active_count在hdfs下的数据地址;
hive (default)> desc formatted ads.ads_member_active_count;
~~~输出参数
Location: hdfs://hadoop01:9000/user/hive/warehouse/ads.db/ads_member_active_count
### --- 创建json文件
[root@hadoop02 ~]# vim /data/yanqidw/script/member_active/export_member_active_count.json
{
"job": {
"setting": {
"speed": {
"channel": 1
}
},
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"path": "/user/hive/warehouse/ads.db/ads_member_active_count/dt=$do_date/*",
"defaultFS": "hdfs://hadoop01:9000",
"column": [
{
"type": "string",
"value": "$do_date"
},
{
"index": 0,
"type": "string"
},
{
"index": 1,
"type": "string"
},
{
"index": 2,
"type": "string"
}
],
"fileType": "text",
"encoding": "UTF-8",
"fieldDelimiter": ","
}
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"writeMode": "replace",
"username": "hive",
"password": "12345678",
"column": [
"dt",
"day_count",
"week_count",
"month_count"
],
"preSql": [
""
],
"connection": [
{
"jdbcUrl": "jdbc:mysql://hadoop05:3306/dwads?useUnicode=true&characterEncoding=utf-8",
"table": [
"ads_member_active_count"
]
}
]
}
}
}
]
}
}
### --- 执行该json文件
[root@hadoop02 ~]# python $DATAX_HOME/bin/datax.py -p "-Ddo_date=2021-09-29" \
/data/yanqidw/script/member_active/export_member_active_count.json
~~~输出参数
2021-10-01 13:47:30.918 [job-0] INFO JobContainer -
任务启动时刻 : 2021-10-01 13:47:17
任务结束时刻 : 2021-10-01 13:47:30
任务总计耗时 : 13s
任务平均流量 : 2B/s
记录写入速度 : 0rec/s
读出记录总数 : 1
读写失败总数 : 0
四、创建传输脚本文件
### --- 创建传输脚本文件
~~~ # 配置json文件的时间传输脚本:export_member_active_count.sh
[root@hadoop02 ~]# vim /data/yanqidw/script/member_active/export_member_active_count.sh
#!/bin/bash
# Export ads_member_active_count from HDFS to MySQL via DataX.
# Usage: export_member_active_count.sh [yyyy-mm-dd]
#   With no argument, exports yesterday's partition.
# No trailing slash here: it is joined with "/" below (was "…/member_active/"
# which produced a double slash in the final path).
JSON=/data/yanqidw/script/member_active
source /etc/profile
if [ -n "$1" ]; then
    do_date="$1"
else
    # default to yesterday's date in yyyy-mm-dd form
    do_date=$(date -d "-1 day" +%F)
fi
# quote path expansions so the script survives spaces in DATAX_HOME
python "$DATAX_HOME/bin/datax.py" -p "-Ddo_date=$do_date" "$JSON/export_member_active_count.json"
~~~ # 执行该脚本
[root@hadoop02 ~]# chmod +x /data/yanqidw/script/member_active/export_member_active_count.sh
[root@hadoop02 ~]# /data/yanqidw/script/member_active/export_member_active_count.sh
~~~输出参数
2021-10-01 14:35:44.221 [job-0] INFO JobContainer -
任务启动时刻 : 2021-10-01 14:35:30
任务结束时刻 : 2021-10-01 14:35:44
任务总计耗时 : 13s
任务平均流量 : 2B/s
记录写入速度 : 0rec/s
读出记录总数 : 1
读写失败总数 : 0
~~~ # 或者指定特定时间
[root@hadoop02 ~]# /data/yanqidw/script/member_active/export_member_active_count.sh 2021-09-29
~~~输出参数
2021-10-01 14:36:54.879 [job-0] INFO JobContainer -
任务启动时刻 : 2021-10-01 14:36:41
任务结束时刻 : 2021-10-01 14:36:54
任务总计耗时 : 13s
任务平均流量 : 2B/s
记录写入速度 : 0rec/s
读出记录总数 : 1
读写失败总数 : 0
五、验证数据导出
### --- 验证mysql是否从hdfs下写入数据
mysql> select * from dwads.ads_member_active_count;
+------------+-----------+------------+-------------+
| dt | day_count | week_count | month_count |
+------------+-----------+------------+-------------+
| 2021-09-29 | 11000 | 11000 | 11000 |
| 2021-09-30 | 9999 | 18999 | 18999 |
+------------+-----------+------------+-------------+