DataX Data Synchronization Tests
I. Full MySQL data sync
1. Create the MySQL table
-- ambari03.devinkim.com:3306/bitestdb root 123456
USE `bitestdb`;
DROP TABLE IF EXISTS `emp`;
CREATE TABLE `emp` (
`id` int(11) DEFAULT NULL,
`name` varchar(100) DEFAULT NULL,
`deg` varchar(100) DEFAULT NULL,
`salary` int(11) DEFAULT NULL,
`dept` varchar(10) DEFAULT NULL,
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`is_delete` bigint(20) DEFAULT '1'
);
insert into `emp`(`id`,`name`,`deg`,`salary`,`dept`,`create_time`,`update_time`,`is_delete`) values
(1201,'gopal','manager',50000,'TP','2018-06-17 18:54:32','2019-01-17 11:19:32',1),
(1202,'manishahello','Proof reader',50000,'TPP','2018-06-15 18:54:32','2018-06-17 18:54:32',0),
(1203,'khalillskjds','php dev',30000,'AC','2018-06-17 18:54:32','2019-03-14 09:18:27',1),
(1204,'prasanth_xxx','php dev',30000,'AC','2018-06-17 18:54:32','2019-04-07 09:09:24',1),
(1205,'kranthixxx','admin',20000,'TP','2018-06-17 18:54:32','2018-12-08 11:50:33',0),
(1206,'garry','manager',50000,'TPC','2018-12-10 21:41:09','2018-12-10 21:41:09',1),
(1207,'oliver','php dev',2000,'AC','2018-12-15 13:49:13','2018-12-15 13:49:13',1),
(1208,'hello','phpDev',200,'TP','2018-12-16 09:41:48','2018-12-16 09:41:48',1),
(1209,'ABC','HELLO',300,NULL,'2018-12-16 09:42:04','2018-12-16 09:42:24',1),
(1210,'HELLO','HELLO',5800,'TP','2019-01-24 09:02:43','2019-01-24 09:02:43',1),
(1211,'WORLD','TEST',8800,'AC','2019-01-24 09:03:15','2019-01-24 09:03:15',1),
(1212,'sdfs','sdfsdf',8500,'AC','2019-03-13 22:01:38','2019-03-13 22:01:38',1),
(1213,NULL,'sdfsdf',9800,'sdfsdf','2019-03-14 09:08:31','2019-03-14 09:08:54',1),
(1214,'xxx','sdfsdf',9500,NULL,'2019-03-14 09:13:32','2019-03-14 09:13:44',0),
(1215,'sdfsf','sdfsdfsdf',9870,'TP','2019-04-07 09:10:39','2019-04-07 09:11:18',0),
(1216,'hello','HELLO',5600,'AC','2019-04-07 09:37:05','2019-04-07 09:37:05',1),
(1217,'HELLO2','hello2',7800,'TP','2019-04-07 09:37:40','2019-04-07 09:38:17',1);
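A quick sanity check (a sketch, assuming the mysql client is available on the host; connection details as in the comment above): the table should now hold 17 rows.
mysql -h ambari03.devinkim.com -uroot -p123456 bitestdb -e "SELECT COUNT(*) FROM emp;"  # expect 17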
2. Create the Hive table
sudo -u hdfs hdfs dfs -rmr /warehouse/tablespace/external/hive/whdb.db/emp/*
drop table if exists `whdb`.`emp`;
create external table `whdb`.`emp` (
id int,
name string,
deg string,
salary double,
dept string,
create_time timestamp,
update_time timestamp,
isdeleted string
) row format delimited fields terminated by '\u0001' STORED AS textfile;
3. Write the DataX job config
# Server: flink02.devinkim.com  Directory: /devinkim/softwares/datax/datax/job
sudo -u hdfs hdfs dfs -rmr /warehouse/tablespace/external/hive/whdb.db/emp/*
cd /devinkim/softwares/datax/datax/job
vim mysql2hive.json
Note in particular: data that Hive stores on HDFS is delimited by '\001' by default (opening such a file in vim shows the fields separated by ^A). The delimiter in the job config must therefore be set to "fieldDelimiter": "\u0001"!
{
"job": {
"setting": {
"speed": {
"channel":1
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": "root",
"password": "123456",
"connection": [
{
"querySql": [
"select * from emp where id < 1208;"
],
"jdbcUrl": [
"jdbc:mysql://ambari03.devinkim.com:3306/bitestdb"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://ambari01.devinkim.com:8020",
"fileType": "text",
"path": "/warehouse/tablespace/external/hive/whdb.db/emp",
"fileName": "emp",
"column": [
{
"name": "id",
"type": "INT"
},
{
"name": "name",
"type": "STRING"
},
{
"name": "deg",
"type": "STRING"
},
{
"name": "salary",
"type": "DOUBLE"
},
{
"name": "dept",
"type": "STRING"
},
{
"name": "create_time",
"type": "TIMESTAMP"
},
{
"name": "update_time",
"type": "TIMESTAMP"
},
{
"name": "isdeleted",
"type": "STRING"
}
],
"writeMode": "append",
"fieldDelimiter": "\u0001"
}
}
}
]
}
}
4. Run the DataX job
cd /devinkim/softwares/datax/datax
python bin/datax.py job/mysql2hive.json
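On success DataX prints a job statistics summary, and hdfswriter leaves files under the target directory named after the configured fileName plus a generated suffix. A quick way to confirm (path as in the job config above):
sudo -u hdfs hdfs dfs -ls /warehouse/tablespace/external/hive/whdb.db/emp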
5. Verify the data
select * from `whdb`.`emp`;
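To confirm the ^A delimiter mentioned in step 3, pipe a few lines of the raw HDFS file through cat -A, which renders \001 as ^A (a sketch; the exact file name varies with hdfswriter's generated suffix, so a glob is used):
sudo -u hdfs hdfs dfs -cat '/warehouse/tablespace/external/hive/whdb.db/emp/emp*' | head -n 2 | cat -A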
II. Incremental, partitioned MySQL sync
1. Create the MySQL table
-- ambari03.devinkim.com:3306/bitestdb root 123456
USE `bitestdb`;
DROP TABLE IF EXISTS `emp`;
CREATE TABLE `emp` (
`id` int(11) DEFAULT NULL,
`name` varchar(100) DEFAULT NULL,
`deg` varchar(100) DEFAULT NULL,
`salary` int(11) DEFAULT NULL,
`dept` varchar(10) DEFAULT NULL,
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`is_delete` bigint(20) DEFAULT '1'
);
INSERT INTO `emp` VALUES (1001,'gopal','manager',50000,'TP','2020-09-14 13:51:57','2020-09-14 13:52:42',1);
INSERT INTO `emp` VALUES (1002,'hdfs','hadoop',20000,'TP','2020-09-14 13:51:57','2020-09-14 13:52:42',1);
2. Create the Hive table
-- sudo -u hdfs hdfs dfs -rmr /warehouse/tablespace/external/hive/whdb.db/emp/*
drop table if exists `whdb`.`emp`;
create external table `whdb`.`emp` (
id int,
name string,
deg string,
salary int,
dept string,
create_time timestamp,
update_time timestamp,
isdeleted bigint
)
partitioned by (day string)
row format delimited fields terminated by '\u0001'
stored as orc
location 'hdfs://ambari01.devinkim.com:8020/warehouse/tablespace/external/hive/whdb.db/emp';
show create table `whdb`.`emp`;
3. Write the DataX job config
vim increment-partition.json
{
"job": {
"setting": {
"speed": {
"channel": 3,
"byte": 1048576
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": "root",
"password": "123456",
"column": [
"`id`",
"`name`",
"`deg`",
"`salary`",
"`dept`",
"`create_time`",
"`update_time`",
"`is_delete`"
],
"where": "`create_time` >= '${start_time}' AND `create_time` < '${end_time}'",
"splitPk": "",
"connection": [
{
"table": [
"emp"
],
"jdbcUrl": [
"jdbc:mysql://ambari03.devinkim.com:3306/bitestdb"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://ambari01.devinkim.com:8020",
"fileType": "orc",
"path": "/warehouse/tablespace/external/hive/whdb.db/emp/${day}",
"fileName": "ambari03-bitestdb-emp",
"writeMode": "append",
"fieldDelimiter": "\u0001",
"column": [
{
"name": "id",
"type": "int"
},
{
"name": "name",
"type": "string"
},
{
"name": "deg",
"type": "string"
},
{
"name": "salary",
"type": "double"
},
{
"name": "dept",
"type": "string"
},
{
"name": "create_time",
"type": "timestamp"
},
{
"name": "update_time",
"type": "timestamp"
},
{
"name": "isdeleted",
"type": "string"
}
]
}
}
}
]
}
}
Note: two ways to write the WHERE condition
-- 1. Use the database's own clock (not recommended)
`create_time` >= DATE_FORMAT( DATE_SUB(CURDATE(), INTERVAL 1 DAY), '%Y-%m-%d 00:00:00') AND `create_time` < DATE_FORMAT(NOW(),'%Y-%m-%d 00:00:00')
-- 2. Pass the start and end times in as DataX runtime parameters (strongly recommended)
`create_time` >= '${start_time}' AND `create_time` < '${end_time}'
4. Run the DataX job
# sudo -u hdfs hdfs dfs -rmr /warehouse/tablespace/external/hive/whdb.db/emp/*
# sudo -u hdfs hdfs dfs -mkdir -p /warehouse/tablespace/external/hive/whdb.db/emp/$(date '+%Y-%m-%d' --date="-1 day")
python bin/datax.py job/increment-partition.json -p "-Dday='$(date '+%Y-%m-%d' --date="-1 day")' -Dstart_time='$(date '+%Y-%m-%d 00:00:00' --date="-1 day")' -Dend_time='$(date '+%Y-%m-%d 00:00:00')'"
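The one-liner above reads more clearly as a small wrapper script (a sketch; the script itself is not part of the original setup, and the paths are taken from the steps above):
#!/bin/bash
# Compute yesterday's partition value and the [start, end) time window
day=$(date '+%Y-%m-%d' --date='-1 day')
start_time="$day 00:00:00"
end_time=$(date '+%Y-%m-%d 00:00:00')
# hdfswriter expects the target directory to exist
sudo -u hdfs hdfs dfs -mkdir -p "/warehouse/tablespace/external/hive/whdb.db/emp/$day"
cd /devinkim/softwares/datax/datax
python bin/datax.py job/increment-partition.json \
  -p "-Dday='$day' -Dstart_time='$start_time' -Dend_time='$end_time'"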
5. Load the data into the partition
hivevariable.hql
use whdb;
load data inpath '/warehouse/tablespace/external/hive/whdb.db/emp/${day}' into table `whdb`.`emp` partition (day='${day}');
select * from emp where day = '${day}';
Pass the variable in from the shell and run hivevariable.hql:
hive --hivevar day=$(date '+%Y-%m-%d' --date="-1 day") -f hivevariable.hql
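If the load worked, the new partition should be visible and hold yesterday's rows (a quick check from the shell):
hive -e "show partitions whdb.emp; select count(*) from whdb.emp where day='$(date '+%Y-%m-%d' --date='-1 day')';"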
III. Full MySQL sync (DataX-Web)
http://flink01:9527/index.html
Username: admin
Password: 123456
1. Create the MySQL data source
2. Create the Hive data source
3. Build the task
Create the Hive table first:
-- sudo -u hdfs hdfs dfs -rmr /yanxu-jin/datax/suv_complain_file/*
USE `whdb`;
DROP TABLE IF EXISTS `whdb`.`t_yyc_ts_info`;
CREATE EXTERNAL TABLE `t_yyc_ts_info`(
`com_id` string COMMENT 'complaint ID',
`com_agent_name` string COMMENT 'complaint dealer name, e.g. 周口京彩润, 北京鹏元, 长沙脸谱, 南阳京润, 昆明孝尊',
`com_type_id` string COMMENT 'complaint type id',
`com_type_name` string COMMENT 'complaint type name: 1001 product quality, 1002 after-sales, 1003 parts, 1004 sales, 1005 other',
`com_car_type` string COMMENT 'complaint vehicle model: BJ40L, BJ40 PLUS, B80C, B40皮卡, B40柴油版, B40V, B40L PLUS-柴油版, BJ80, BJ80汽油版, BJ40环塔冠军版, BJ40城市猎人版, BJ40PLUS域',
`com_info` string COMMENT 'complaint content',
`com_time` string COMMENT 'complaint time',
`com_dt_limit` string COMMENT 'latest deadline for handling the complaint',
`com_status` string COMMENT 'complaint handling status: 10 pending, 11 handled',
`com_do_time` string COMMENT 'complaint handling time',
`com_do` int COMMENT 'complaint handling cycle, in days'
) row format delimited fields terminated by '\u0001'
STORED AS TEXTFILE
LOCATION '/yanxu-jin/datax/suv_complain_file/';
4. Add the task under Task Management
5. Run the task and check the logs
sudo -u hdfs hdfs dfs -rmr /yanxu-jin/datax/suv_complain_file/*
IV. Full Oracle sync (DataX-Web)
1. Create the Hive table
DROP TABLE IF EXISTS `whdb`.`t_user_reg`;
CREATE TABLE `whdb`.`t_user_reg` (
`cif_no` string COMMENT 'customer number',
`cif_type` string COMMENT 'customer type: 10 consumer, 11 distributor, 12 reseller, 13 internal',
`cif_type_name` string COMMENT 'customer type name: 10 consumer, 11 distributor, 12 reseller, 13 internal',
`cif_account` string COMMENT 'account number',
`cus_tel` string COMMENT 'customer mobile number',
`active_flag` string COMMENT 'activation flag: 1 activated, 0 not activated',
`cif_nickname` string COMMENT 'nickname',
`reg_origin` string COMMENT 'registration source: 0 mobile, 1 PC, 2 counter',
`cif_origin` string COMMENT 'customer origin',
`account_flag` string COMMENT 'account-opening flag: 0 not opened, 1 opened',
`area_code` bigint COMMENT 'administrative region code',
`email` string COMMENT 'customer email',
`reg_time` timestamp COMMENT 'registration time',
`occ_time` timestamp COMMENT 'timestamp'
) row format delimited fields terminated by '\u0001' STORED AS orc
LOCATION 'hdfs://ambari01.devinkim.com:8020/warehouse/tablespace/managed/hive/whdb.db/t_user_reg';
2. Write the DataX job config
{
"job": {
"setting": {
"speed": {
"channel": 3,
"byte": 1048576
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "oraclereader",
"parameter": {
"username": "EnJGXrfLkVvuqUICQB3tGg==",
"password": "E8gaBXRn6CCJLyyE1uIA6w==",
"column": [
"\"CIF_NO\"",
"\"CIF_TYPE\"",
"\"CIF_TYPE_NAME\"",
"\"CIF_ACCOUNT\"",
"\"CUS_TEL\"",
"\"ACTIVE_FLAG\"",
"\"CIF_NICKNAME\"",
"\"REG_ORIGIN\"",
"\"CIF_ORIGIN\"",
"\"ACCOUNT_FLAG\"",
"\"AREA_CODE\"",
"\"EMAIL\"",
"\"REG_TIME\"",
"\"OCC_TIME\""
],
"splitPk": "",
"connection": [
{
"table": [
"T_USER_REG"
],
"jdbcUrl": [
" jdbc:oracle:thin:@10.90.21.26:1521:orcl"
]
}
]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://ambari01.devinkim.com:8020",
"fileType": "orc",
"path": "/warehouse/tablespace/managed/hive/whdb.db/t_user_reg",
"fileName": "file_user_reg",
"writeMode": "append",
"fieldDelimiter": "\u0001",
"column": [
{
"name": "cif_no",
"type": "string"
},
{
"name": "cif_type",
"type": "string"
},
{
"name": "cif_type_name",
"type": "string"
},
{
"name": "cif_account",
"type": "string"
},
{
"name": "cus_tel",
"type": "string"
},
{
"name": "active_flag",
"type": "string"
},
{
"name": "cif_nickname",
"type": "string"
},
{
"name": "reg_origin",
"type": "string"
},
{
"name": "cif_origin",
"type": "string"
},
{
"name": "account_flag",
"type": "string"
},
{
"name": "area_code",
"type": "bigint"
},
{
"name": "email",
"type": "string"
},
{
"name": "reg_time",
"type": "timestamp"
},
{
"name": "occ_time",
"type": "timestamp"
}
]
}
}
}
]
}
}
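Note that the username and password above look like DataX-Web-encrypted values, which DataX-Web decrypts when it triggers the job. To run the exported JSON with plain datax.py instead, the credentials would first have to be replaced with plaintext values (a sketch; the file name oracle2hive.json is an assumption):
cd /devinkim/softwares/datax/datax
python bin/datax.py job/oracle2hive.json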
3. Manually load the data
load data inpath '/warehouse/tablespace/managed/hive/whdb.db/t_user_reg/*' into table `whdb`.`t_user_reg`;
select count(*) from `whdb`.`t_user_reg`;
select * from `whdb`.`t_user_reg` limit 10;