实现原理
MysqlReader插件实现了从Mysql读取数据。简而言之,MysqlReader通过JDBC连接器连接到远程的Mysql数据库,并根据用户配置的信息生成查询SELECT SQL语句,然后发送到远程Mysql数据库,并将该SQL执行返回结果使用DataX自定义的数据类型拼装为抽象的数据集,并传递给下游Writer处理。
对于用户配置Table、Column、Where的信息,MysqlReader将其拼接为SQL语句发送到Mysql数据库;对于用户配置querySql信息,MysqlReader直接将其发送到Mysql数据库。
不同于其他关系型数据库读取插件，MysqlReader 不支持 fetchSize 参数。
HdfsWriter提供向HDFS文件系统指定路径中写入TEXTFile文件和ORCFile文件,文件内容可与hive中表关联。
MysqlReader插件官方文档,有介绍各个参数的说明
https://github.com/alibaba/DataX/blob/master/mysqlreader/doc/mysqlreader.md
HdfsWriter插件官方文档,有介绍各个参数的说明
https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md
开始开发
1 为方便hive查看数据,所以先准备建表的语句 (可选)
-- External table over the text (default-format) files written by hdfswriter.
-- Field delimiter ',' must match the job's "fieldDelimiter": ",".
-- (Original used '//' comments, which are not valid HiveQL; Hive uses '--'.)
CREATE EXTERNAL TABLE IF NOT EXISTS e_board (
    id    string,
    title string,
    num   string,
    hide  string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '-'
LOCATION '/user/oozie/oozie_test/dir1/e_board';
-- ORC-format variant of the table.
-- NOTE(review): the original reused the name and location of the text table
-- above; with IF NOT EXISTS the second statement would silently be a no-op,
-- and mixing text and ORC files under one directory breaks reads. The ORC
-- table therefore gets its own name and directory — point the hdfswriter
-- "path" of the ORC job at this directory.
-- ROW FORMAT delimiters are omitted: ORC files are self-describing, so
-- delimiter settings are ignored for STORED AS ORC.
CREATE EXTERNAL TABLE IF NOT EXISTS e_board_orc (
    id    string,
    title string,
    num   string,
    hide  string
)
STORED AS ORC
LOCATION '/user/oozie/oozie_test/dir1/e_board_orc';
2 编写dataX的json文件,文件名 mysql_hdfs.json,演示mysql中一张表到hdfs上
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": ["id","title","num","hide"],
"connection": [
{
"jdbcUrl": ["jdbc:mysql://18.58.4.2:3306/linshi_1"],
"table": ["e_board"]
}
],
"splitPk": "id",
"password": "4560806",
"username": "root",
"where": ""
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{"name":"id","type":"string"},
{"name":"title","type":"string"},
{"name":"num","type":"string"},
{"name":"hide","type":"string"}
],
"defaultFS": "hdfs://kncloud02:8020",
"path": "/user/oozie/oozie_test/dir1/e_board",
"fileName": "target_e_board",
"fileType": "text",
"fieldDelimiter": ",",
"compress": "gzip",
"writeMode": "append"
}
}
}
],
"setting": {
"speed": {
"channel": "5"
}
}
}
}
演示 读取mysql中一张表，保存到hdfs上，以orc的格式存储，snappy压缩
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column":["EKORG","INFNR","LIFNR","ESOKZ"],
"connection": [
{
"jdbcUrl": ["jdbc:mysql://58.18.4.2:3306/kndb_standard_cost_multi_company"],
"table": ["purchase_record"]
}
],
"password": "65780806",
"username": "root",
"where": ""
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{"name":"EKORG","type":"string"},
{"name":"INFNR","type":"string"},
{"name":"LIFNR","type":"string"},
{"name":"ESOKZ","type":"string"}
],
"defaultFS": "hdfs://kncloud02:8020",
"path": "/user/oozie/oozie_test/dir1/e_board",
"fileName": "target_e_board",
"fileType": "orc",
"fieldDelimiter": ",",
"compress": "SNAPPY",
"writeMode": "append"
}
}
}
],
"setting": {
"speed": {
"channel": "10"
}
}
}
}
演示通过自定义的sql语句读取mysql中的数据,到hdfs上
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": ["jdbc:mysql://8.58.4.2:3306/linshi_1"],
"querySql": ["SELECT b.c_id,SUBSTR(a.c_title,5),SUBSTR(b.c_title,8),a.c_addtime FROM article a INNER JOIN e_contect b ON a.l_id=b.c_parent2"]
}
],
"password": "65806",
"username": "root"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{"name":"id","type":"string"},
{"name":"title","type":"string"},
{"name":"num","type":"string"},
{"name":"hide","type":"string"}
],
"defaultFS": "hdfs://kncloud02:8020",
"path": "/user/oozie/oozie_test/dir1/e_board",
"fileName": "target_e_board",
"fileType": "text",
"fieldDelimiter": ",",
"compress": "gzip",
"writeMode": "append"
}
}
}
],
"setting": {
"speed": {
"channel": "1"
}
}
}
}
3 进入datax目录,运行datax命令,开始抽取
python bin/datax.py job/mysql_hdfs.json