简介
DataX 是阿里巴巴集团内被广泛使用的离线数据同步工具/平台,实现包括 MySQL、Oracle、SqlServer、Postgre、HDFS、Hive、ADS、HBase、TableStore(OTS)、MaxCompute(ODPS)、DRDS 等各种异构数据源之间高效的数据同步功能。
DataX本身作为数据同步框架,将不同数据源的同步抽象为从源头数据源读取数据的Reader插件,以及向目标端写入数据的Writer插件,理论上DataX框架可以支持任意数据源类型的数据同步工作。同时DataX插件体系作为一套生态系统, 每接入一套新数据源该新加入的数据源即可实现和现有的数据源互通。
DataX的安装
DataX不需要依赖其他服务,直接上传、解压、安装、配置环境变量即可
也可以直接在windows上解压
DataX的使用
stream2stream
编写配置文件stream2stream.json
# stream2stream.json { "job": { "content": [ { "reader": { "name": "streamreader", "parameter": { "sliceRecordCount": 10, "column": [ { "type": "long", "value": "10" }, { "type": "string", "value": "hello,你好,世界-DataX" } ] } }, "writer": { "name": "streamwriter", "parameter": { "encoding": "UTF-8", "print": true } } } ], "setting": { "speed": { "channel": 5 } } } }
执行同步任务
datax.py stream2stream.json
执行结果
mysql2mysql
需要新建student2数据库,并创建student表
编写配置文件mysql2mysql.json
{ "job": { "content": [ { "reader": { "name": "mysqlreader", "parameter": { "username": "root", "password": "123456", "column": [ "id", "name", "age", "gender", "clazz", "last_mod" ], "splitPk": "age", "connection": [ { "table": [ "student" ], "jdbcUrl": [ "jdbc:mysql://master:3306/student" ] } ] } }, "writer": { "name": "mysqlwriter", "parameter": { "writeMode": "insert", "username": "root", "password": "123456", "column": [ "id", "name", "age", "gender", "clazz", "last_mod" ], "preSql": [ "truncate student2" ], "connection": [ { "jdbcUrl": "jdbc:mysql://master:3306/student2?useUnicode=true&characterEncoding=utf8", "table": [ "student2" ] } ] } } } ], "setting": { "speed": { "channel": 6 } } } }
执行同步任务
datax.py mysql2mysql.json
mysql2hdfs
写hive跟hdfs时一样的
编写配置文件mysql2hdfs.json
{ "job": { "content": [ { "reader": { "name": "mysqlreader", "parameter": { "username": "root", "password": "123456", "column": [ "id", "name", "age", "gender", "clazz", "last_mod" ], "splitPk": "age", "connection": [ { "table": [ "student" ], "jdbcUrl": [ "jdbc:mysql://master:3306/student" ] } ] } }, "writer": { "name": "hdfswriter", "parameter": { "defaultFS": "hdfs://master:9000", "fileType": "text", "path": "/user/hive/warehouse/datax.db/students", "fileName": "student", "column": [ { "name": "id", "type": "bigint" }, { "name": "name", "type": "string" }, { "name": "age", "type": "INT" }, { "name": "gender", "type": "string" }, { "name": "clazz", "type": "string" }, { "name": "last_mod", "type": "string" } ], "writeMode": "append", "fieldDelimiter": "," } } } ], "setting": { "speed": { "channel": 6 } } } }
hbase2mysql
{ "job": { "content": [ { "reader": { "name": "hbase11xreader", "parameter": { "hbaseConfig": { "hbase.zookeeper.quorum": "master:2181" }, "table": "student", "encoding": "utf-8", "mode": "normal", "column": [ { "name": "rowkey", "type": "string" }, { "name": "cf1:name", "type": "string" }, { "name": "cf1:age", "type": "string" }, { "name": "cf1:gender", "type": "string" }, { "name": "cf1:clazz", "type": "string" } ], "range": { "startRowkey": "", "endRowkey": "", "isBinaryRowkey": false } } }, "writer": { "name": "mysqlwriter", "parameter": { "writeMode": "insert", "username": "root", "password": "123456", "column": [ "id", "name", "age", "gender", "clazz" ], "preSql": [ "truncate student2" ], "connection": [ { "jdbcUrl": "jdbc:mysql://master:3306/student2?useUnicode=true&characterEncoding=utf8", "table": [ "student2" ] } ] } } } ], "setting": { "speed": { "channel": 6 } } } }
mysql2hbase
mysql中的score表需将cource_id改为course_id,并将student_id、course_id设为主键,并将所有字段的类型改为int
hbase需先创建score表:create 'score','cf1'
{ "job": { "content": [ { "reader": { "name": "mysqlreader", "parameter": { "username": "root", "password": "123456", "column": [ "student_id", "course_id", "score" ], "splitPk": "course_id", "connection": [ { "table": [ "score" ], "jdbcUrl": [ "jdbc:mysql://master:3306/student" ] } ] } }, "writer": { "name": "hbase11xwriter", "parameter": { "hbaseConfig": { "hbase.zookeeper.quorum": "master:2181" }, "table": "score", "mode": "normal", "rowkeyColumn": [ { "index":0, "type":"string" }, { "index":-1, "type":"string", "value":"_" }, { "index":1, "type":"string" } ], "column": [ { "index":2, "name": "cf1:score", "type": "int" } ], "encoding": "utf-8" } } } ], "setting": { "speed": { "channel": 6 } } } }
mysql2Phoenix
在Phoenix中创建STUDENT表
CREATE TABLE IF NOT EXISTS STUDENT ( ID VARCHAR NOT NULL PRIMARY KEY, NAME VARCHAR, AGE BIGINT, GENDER VARCHAR , CLAZZ VARCHAR );
编写配置文件MySQLToPhoenix.json
{ "job": { "setting": { "speed": { "channel": 3 }, "errorLimit": { "record": 0, "percentage": 0.02 } }, "content": [ { "reader": { "name": "mysqlreader", "parameter": { "username": "root", "password": "123456", "column": [ "id", "name", "age", "gender", "clazz" ], "splitPk": "id", "connection": [ { "table": [ "student" ], "jdbcUrl": [ "jdbc:mysql://master:3306/student?useSSL=false" ] } ] } }, "writer": { "name": "hbase11xsqlwriter", "parameter": { "batchSize": "256", "column": [ "ID", "NAME", "AGE", "GENDER", "CLAZZ" ], "hbaseConfig": { "hbase.zookeeper.quorum": "master,node1,node2", "zookeeper.znode.parent": "/hbase" }, "nullMode": "skip", "table": "STUDENT" } } } ] } }
HDFSToHBase
将students.txt数据上传至HDFS的
/data/student1/
目录在HBase中创建datax表:
create 'datax','cf1'
{ "job": { "setting": { "speed": { "channel": 3 }, "errorLimit": { "record": 0, "percentage": 0.02 } }, "content": [ { "reader": { "name": "hdfsreader", "parameter": { "path": "/data/student1/", "defaultFS": "hdfs://master:9000", "column": [ { "index": 0, "type": "string" }, { "index": 1, "type": "string" }, { "index": 2, "type": "string" }, { "index": 3, "type": "string" }, { "index": 4, "type": "string" }, { "index": 5, "type": "string" } ], "fileType": "text", "encoding": "UTF-8", "fieldDelimiter": "," } }, "writer": { "name": "hbase11xwriter", "parameter": { "hbaseConfig": { "hbase.zookeeper.quorum": "master,node1,node2" }, "table": "datax", "mode": "normal", "rowkeyColumn": [ { "index": 0, "type": "string" }, { "index": -1, "type": "string", "value": "_" }, { "index": 1, "type": "string" } ], "column": [ { "index": 2, "name": "cf1:age", "type": "string" }, { "index": 3, "name": "cf1:gender", "type": "string" }, { "index": 4, "name": "cf1:clazz", "type": "string" }, { "index": 5, "name": "cf1:ts", "type": "string" } ], "versionColumn": { "index": 5 }, "encoding": "utf-8" } } } ] } }