自动生成json脚本:
通过调用以下python脚本可以快速的生成datax的json脚本。
在~./bin目录下创建脚本文件
[cxd@hadoop102 ~]$ vim datax_json_config.py
在文件中输入以下内容
#使用pyhton写的脚本文件:用于生成datax的json配置文件。 # coding=utf-8 import json import getopt import os import sys import MySQLdb #MySQL相关配置,需根据实际情况作出修改 mysql_host = "hadoop102" mysql_port = "3306" mysql_user = "datax" mysql_passwd = "5Cp*!FXj1+4076=GSA" #HDFS NameNode相关配置,需根据实际情况作出修改 hdfs_nn_host = "hadoop102 hdfs_nn_port = "8020" #生成配置文件的目标路径,可根据实际情况作出修改 output_path = "/opt/module/datax/job/import #获取mysql连接 def get_connection(): return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd) #获取表格的元数据 包含列名和数据类型 def get_mysql_meta(database, table): connection = get_connection() cursor = connection.cursor() sql = "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION" cursor.execute(sql, [database, table]) fetchall = cursor.fetchall() cursor.close() connection.close() return fetchall #获取mysql表的列名 def get_mysql_columns(database, table): return map(lambda x: x[0], get_mysql_meta(database, table)) #将获取的元数据中mysql的数据类型转换为hive的数据类型 写入到hdfswriter中 def get_hive_columns(database, table): def type_mapping(mysql_type): mappings = { "bigint": "bigint", "int": "bigint", "smallint": "bigint", "tinyint": "bigint", "decimal": "string", "double": "double", "float": "float", "binary": "string", "char": "string", "varchar": "string", "datetime": "string", "time": "string", "timestamp": "string", "date": "string", "text": "string" } return mappings[mysql_type] meta = get_mysql_meta(database, table) return map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta) #生成json文件 def generate_json(source_database, source_table): job = { #配置channel限速 "core": { "transport": { "channel": { "speed": { "byte": 1800000 } } } }, "job": { "setting": { "speed": { "byte": 5242880 }, "errorLimit": { "record": 0, "percentage": 0.02 } }, "content": [{ "reader": { "name": "mysqlreader", "parameter": { "username": mysql_user, "password": mysql_passwd, "column": get_mysql_columns(source_database, source_table), "splitPk": "", "connection": [{ "table": [source_table], "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database] }] } }, "writer": { "name": "hdfswriter", "parameter": { "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port, "fileType": "text", "path": "${targetdir}", "fileName": source_table, "column": get_hive_columns(source_database, source_table), "writeMode": "append", "fieldDelimiter": "\t", "compress": "gzip" } } }] } } if not os.path.exists(output_path): os.makedirs(output_path) with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f: json.dump(job, f) def main(args): source_database = "" source_table = "" options, arguments = getopt.getopt(args, '-d:-t:', ['sourcedb=', 'sourcetbl=']) for opt_name, opt_value in options: if opt_name in ('-d', '--sourcedb'): source_database = opt_value if opt_name in ('-t', '--sourcetbl'): source_table = opt_value generate_json(source_database, source_table) if __name__ == '__main__': main(sys.argv[1:])
保存后,修改脚本权限
[cxd@hadoop102 ~]$ chmod 777 datax_json_config.py
脚本调用,生成json脚本
[cxd@hadoop102 ~]$ python gen_import_config.py -d db -t table
==注意:==db为数据库名,table则为需要查询的表名。
我们在调用脚本时,只需要给第一第二个变量赋值即可。
但是由于如果我们要生产多个json脚本时,建议在编写一个shell脚本将此python脚本封装一下。
参考尚硅谷官网文档。