Generating the JSON job files DataX needs for data synchronization, with Python

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import os
import sys

import psycopg2
from clickhouse_driver import connect

if sys.version_info[0] == 2:  # the reload/setdefaultencoding hack is Python 2 only
    reload(sys)
    sys.setdefaultencoding('utf-8')

"""
yum install postgresql-devel*
yum install python-psycopg2  # python35-psycopg2 for py3
pip install clickhouse_drivers

脚本文件 解决问题:
  1. 自定义表名,自定义环境 可以将PG表结构 同步到CK中
  2. 支持按不同环境, 自动创建 datax 同步数据需要的 json 文件
"""

# postgres connection settings
pg_dict = {
    'prod': {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "5432"},
    'dev':  {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "5432"}
}

# clickhouse connection settings
# NOTE: 8123 is the HTTP port used in the JDBC URL; the native clickhouse_driver connection below uses 9000
ck_dict = {
    'prod': {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "8123"},
    'dev':  {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "8123"}
}


class DataXHJL:
    # NOTE: these are class-level attributes; jdbcurl_dict in particular is mutated
    # in place in __init__, so it is shared across instances
    jdbcurl_dict = {"pg": "", "ck": ""}  # JDBC URLs
    cur_pg, conn_pg = None, None
    cur_ck, conn_ck = None, None
    env_name = 'dev'
    table_name_pg = ""  # name of the postgres table to operate on
    table_name_ck = ""  # corresponding clickhouse table name
    datax_json_path = ""  # path of the json file DataX uses for the sync, e.g. /usr/local/datax/bin/nt_member.json

    def __init__(self, env_name, table_name_pg):
        """
        连接 postgres , clickhouse DB
        """
        self.env_name = env_name
        self.table_name_pg = table_name_pg
        self.table_name_ck = table_name_pg.replace("nt_", "ck_", 1)
        # jdbcurl 配置
        self.jdbcurl_dict.update({
            # "pg": "jdbc:postgresql://xxxx:5432/hh_test",
            # "ck": "jdbc:clickhouse://127.0.0.1:8123/hh_test",
            "pg": "jdbc:postgresql://%s:%s/%s" % (
                pg_dict.get(self.env_name, {}).get('host'),
                pg_dict.get(self.env_name, {}).get('port'),
                pg_dict.get(self.env_name, {}).get('database')
            ),
            "ck": "jdbc:clickhouse://%s:%s/%s" % (
                ck_dict.get(self.env_name, {}).get('host'),
                ck_dict.get(self.env_name, {}).get('port'),
                ck_dict.get(self.env_name, {}).get('database')
            ),
        })
        # postgres connection (see https://www.cnblogs.com/hello-wei/p/9685998.html)
        self.conn_pg = psycopg2.connect(database=pg_dict.get(self.env_name, {}).get('database'),
                                        user=pg_dict.get(self.env_name, {}).get('user'),
                                        password=pg_dict.get(self.env_name, {}).get('password'),
                                        host=pg_dict.get(self.env_name, {}).get('host'),
                                        port=pg_dict.get(self.env_name, {}).get('port'))
        # cursor used to run statements against postgres
        self.cur_pg = self.conn_pg.cursor()
        # smoke test, e.g.:
        #   self.cur_pg.execute("CREATE TABLE test_conn(id int, name text)")
        #   self.cur_pg.execute("INSERT INTO test_conn values(1,'haha')")
        #   self.conn_pg.commit()
        # clickhouse native connection: user:password@host:port/database
        # e.g. connect('clickhouse://default:123456@10.10.10.1:9000/hh_test')
        # NOTE: the native driver speaks on port 9000; the HTTP port 8123 in ck_dict is only for the JDBC URL
        self.conn_ck = connect('clickhouse://%s:%s@%s:9000/%s' % (
            ck_dict.get(self.env_name, {}).get('user'),
            ck_dict.get(self.env_name, {}).get('password'),
            ck_dict.get(self.env_name, {}).get('host'),
            ck_dict.get(self.env_name, {}).get('database')
        ))
        self.cur_ck = self.conn_ck.cursor()
        # smoke test, e.g.:
        #   self.cur_ck.execute('show databases')
        #   print(self.cur_ck.fetchall())

    def close(self):
        self.cur_pg.close()   # close postgres cursor
        self.conn_pg.close()  # close postgres connection
        self.cur_ck.close()   # close clickhouse cursor
        self.conn_ck.close()  # close clickhouse connection

    def get_pg_table_columns(self):
        """
        将 postgres 表中字段 使用 "", 连接在一起
        """
        columns = []
        hql = "SELECT column_name from information_schema.columns where table_name = '%s' and table_schema = 'public' order by ordinal_position ASC" % self.table_name_pg
        self.cur_pg.execute(hql)
        rows = self.cur_pg.fetchall()
        for row in rows:
            # TODO: 字段名称是大写的, datax 执行同步时 报错(可能是datax的BUG); 这些字段不影响业务, 做跳过处理
            # nt_sale_shop.SMH_account   nt_product.SAP_description,SAP_code,SAP_description2,SAP_code2
            if row[0] in ["SMH_account", "SAP_description", "SAP_code", "SAP_description2", "SAP_code2"]:
                continue
            columns.append(row[0])
        return columns

    def get_pg_table_columns_comment(self):
        """
        将 postgres 表中字段 使用 "", 连接在一起
        """
        column_dict = {}
        self.cur_pg.execute("""
            SELECT 
                a.attname as "字段名",
                col_description(a.attrelid,a.attnum) as "注释",
                concat_ws('',t.typname,SUBSTRING(format_type(a.atttypid,a.atttypmod) from '\(.*\)')) as "字段类型"
            FROM 
                pg_class as c,
                pg_attribute as a, 
                pg_type as t
            WHERE 
                c.relname = '%s' 
                and a.atttypid = t.oid 
                and a.attrelid = c.oid 
                and a.attnum>0;
        """ % self.table_name_pg)
        rows = self.cur_pg.fetchall()
        for row in rows:
            column_dict[row[0]] = {"comment": row[1] or "", "type": row[2] or ""}
        return column_dict

    def create_ck_table(self):
        # 1. build the clickhouse CREATE TABLE statement from the postgres catalog
        self.cur_pg.execute("""
            select
            concat(
                'create table ',
                '%s',
                '(',
                'ts_ms Int64 DEFAULT now(), insert_time DateTime DEFAULT now(), ',
                string_agg(
                        concat(
                                column_name,
                                ' ',
                                CASE when is_nullable = 'YES' and column_name != 'create_date' THEN 'Nullable(' END,
                                CASE
                                    when data_type = 'integer'  then 'Int32'
                                    when data_type = 'smallint' then 'Int16'
                                    when data_type = 'bigint'   then 'Int64'
                                    when data_type = 'numeric'  then 'Float64'
                                    when data_type = 'double precision'  then 'Float64'
                                    when data_type = 'character varying' then 'String'
                                    when data_type = 'character' then 'String'
                                    when data_type = 'text'      then 'String'
                                    when data_type = 'boolean'   then 'UInt8'
                                    when data_type = 'date'      then 'Date'
                                    when data_type = 'timestamp without time zone' then 'Datetime'
                                    when data_type = 'time without time zone'      then 'String'
                                    else concat('###################', data_type)
                                    END,
                                CASE when is_nullable = 'YES' and column_name != 'create_date' THEN ')' END
                            ),
                        ', ' order by ordinal_position
                ),
                ') ',
                'ENGINE = ReplacingMergeTree() '
                'PARTITION BY toYYYYMM(create_date) '
                'ORDER BY (id) '
            ) as table_ddl
        from information_schema.columns
        where true
          and table_name = '%s'
          and table_schema = 'public'
        group by table_name ;
        """ % (self.table_name_ck, self.table_name_pg))
        rows = self.cur_pg.fetchone()
        ck_table_create_sql = rows[0]
        print("1. build the clickhouse CREATE TABLE statement -- ok")
        # print(rows[0])
        # 2. drop the table in clickhouse if it already exists
        self.cur_ck.execute("DROP TABLE IF EXISTS %s" % self.table_name_ck)
        print("2. drop existing clickhouse table -- ok")
        # 3. create the table in clickhouse
        self.cur_ck.execute(ck_table_create_sql)
        print("3. create clickhouse table -- ok")
        # 4. copy the column comments over to clickhouse
        column_dict = self.get_pg_table_columns_comment()
        for column_name, _item in column_dict.items():
            self.cur_ck.execute("ALTER TABLE `%s` COMMENT COLUMN `%s` '%s';" % (self.table_name_ck, column_name, _item.get('comment', '')))
        print("4. set clickhouse column comments -- ok")

    def generate_datax_json_file(self, querySql=None):
        """
        @param querySql 没有值 表示 根据PG原表 做为数据源
                        有值   表示 根据PG查询的SQL 作为数据源
        """
        # 1. build the json dict DataX needs; the hardcoded reader/writer values in this
        #    template are placeholders and are overwritten below
        json_dict = {
                    "job": {
                        "content": [{
                            "reader": {
                                "name": "postgresqlreader",
                                "parameter": {
                                    "column": ["id", "create_uid", "create_date", "write_date", "write_uid", "account", "account_type", "wx_app_id", "applet_openid", "member_id"],
                                    "connection": [{
                                        "jdbcUrl": ["jdbc:postgresql://10.200.100.1:5432/hh_test"],
                                        "table": ["nt_member_account"]
                                    }],
                                    "username": pg_dict.get(self.env_name, {}).get('user'),
                                    "password": pg_dict.get(self.env_name, {}).get('password')
                                }
                            },
                            "writer": {
                                "name": "clickhousewriter",
                                "parameter": {
                                    "username": ck_dict.get(self.env_name, {}).get("user"),
                                    "password": ck_dict.get(self.env_name, {}).get("password"),
                                    #"column": ["id", "create_uid", "create_date", "write_date", "write_uid", "account", "account_type", "wx_app_id", "applet_openid", "member_id"],
                                    "connection": [{
                                        "jdbcUrl": "jdbc:clickhouse://127.0.0.1:8123/hh_test",
                                        "table": ["ck_member_account"]
                                    }],
                                    # "preSql": [
                                    #     "TRUNCATE TABLE ck_member_account",
                                    #     "drop view if EXISTS `view_ck_member_account`",
                                    #     "create view `view_ck_member_account` as select * from (select row_number() OVER www as row, * from ck_member_account WINDOW www as (partition by id order by insert_time desc) ) where row = 1"
                                    # ]
                                }
                            }
                        }],
                        "setting": {
                            "speed": {
                                "channel": 1
                            }
                        }
                    }
                }
        if querySql:
            # postgres (reader) settings
            json_dict['job']['content'][0]['reader']['parameter'].update({
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "querySql": [querySql]
                }]
            })
            # clickhouse (writer) settings
            json_dict['job']['content'][0]['writer']['parameter'].update({
                "column": ["*"],
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                ]
            })
        else:
            columns = self.get_pg_table_columns()
            # print(columns)
            # replace the placeholder settings in the template
            # postgres (reader) settings
            json_dict['job']['content'][0]['reader']['parameter'].update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "table": [self.table_name_pg]
                }]
            })
            # clickhouse (writer) settings
            json_dict['job']['content'][0]['writer']['parameter'].update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                    "drop view if EXISTS `view_%s`" % self.table_name_ck,
                    "create view `view_%s` as select * from (select row_number() OVER www as row, * from %s WINDOW www as (partition by id order by insert_time desc) ) where row = 1" % (self.table_name_ck, self.table_name_ck)
                ]
            })

        self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        self.save_file(path=self.datax_json_path, item=json_dict)
        print("1. 生成 datax 需要的json文件: %s -- ok " % self.datax_json_path)

    def datax_copy_data_to_ck(self):
        """
        # 执行 datax 复制数据
        # /usr/local/datax/bin/datax.py
        """
        print("datax执行数据全量同步操作 ...")
        if not self.datax_json_path:
            self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        os.system("python /usr/local/datax/bin/datax.py %s" % self.datax_json_path)

    def save_file(self, path, item):
        # serialize the dict to a writable json string
        item = json.dumps(item, indent=4)
        try:
            if os.path.exists(path):
                os.remove(path)
            with open(path, "w") as f:
                f.write(item)
                print("wrote the json file DataX needs for the sync -- ok")
        except Exception as e:
            print("write error==>", e)


if __name__ == '__main__':
    help_doc = """
    使用方法:
    1. 将PG表结构复制到CK中             python datax_hh.py create_ck_table <环境> <PG表名>
    2. 生成datax需要的json文件          python datax_hh.py generate_datax_json_file <环境> <PG表名> 
    3. 调用datax.py将全量数据同步到CK中  python datax_hh.py datax_copy_data_to_ck <环境> <PG表名> 
    4. 执行以上合并操作                 python datax_hh.py all  <环境> <PG表名>
    5. 导入会员分组涉及的4张表           python datax_hh.py ck_member_group <环境> <*表名看以下说明*>
    
    示例: 将测试环境pg指定表导入ck测试库中:  python datax_hh.py all dev  nt_payment_type
    示例: 将正式环境pg指定表导入ck正式库中:  python datax_hh.py all prod nt_payment_type
    
    """
    """
    # 会员分组: 使用到的4张大宽表
    python datax_hjl.py ck_member_group prod ck_member 
    python datax_hjl.py all dev nt_sale_order
    python datax_hjl.py all dev nt_sale_order_line
    """
    if "help" in sys.argv or "-h" in sys.argv or len(sys.argv) == 1:
        print(help_doc)
        sys.exit()

    action, env_name, table_name = sys.argv[-3], sys.argv[-2], sys.argv[-1]
    if env_name not in ['dev', 'prod']:
        print("invalid environment; valid values: dev, prod")
        sys.exit()

    if table_name == "nt_member":
        print("the member table is a customized wide table in clickhouse and cannot be handled this way")
        sys.exit()

    # run the requested action
    cls = DataXHJL(env_name=env_name, table_name_pg=table_name)

    if "create_ck_table" in sys.argv:
        cls.create_ck_table()
    elif "generate_datax_json_file" in sys.argv:
        cls.generate_datax_json_file()
    elif "datax_copy_data_to_ck" in sys.argv:
        cls.datax_copy_data_to_ck()
    elif "all" in sys.argv:
        cls.create_ck_table()
        cls.generate_datax_json_file()
        cls.datax_copy_data_to_ck()
    cls.close()
```
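For reference, a full-table run such as `python datax_hjl.py generate_datax_json_file dev nt_payment_type` writes a file like the sketch below (here `./ck_payment_type_dev.json`). The host names, credentials and the three-column table are illustrative placeholders, not values from a real environment:

```
{
    "job": {
        "content": [{
            "reader": {
                "name": "postgresqlreader",
                "parameter": {
                    "column": ["id", "create_date", "name"],
                    "connection": [{
                        "jdbcUrl": ["jdbc:postgresql://host:5432/dbname"],
                        "table": ["nt_payment_type"]
                    }],
                    "username": "user",
                    "password": "password"
                }
            },
            "writer": {
                "name": "clickhousewriter",
                "parameter": {
                    "username": "user",
                    "password": "password",
                    "column": ["id", "create_date", "name"],
                    "connection": [{
                        "jdbcUrl": "jdbc:clickhouse://host:8123/dbname",
                        "table": ["ck_payment_type"]
                    }],
                    "preSql": [
                        "TRUNCATE TABLE ck_payment_type",
                        "drop view if EXISTS `view_ck_payment_type`",
                        "create view `view_ck_payment_type` as select * from (select row_number() OVER www as row, * from ck_payment_type WINDOW www as (partition by id order by insert_time desc) ) where row = 1"
                    ]
                }
            }
        }],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```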

Below is a simple example showing how DataX can sync data from multiple tables into a single JSON file.

Suppose we have two tables, a user table (user) and an order table (order), with the following contents:

user table:

| id   | name | age  |
| ---- | ---- | ---- |
| 1    | Tom  | 20   |
| 2    | Jack | 22   |
| 3    | Mary | 21   |

order table:

| id   | user_id | product | amount |
| ---- | ------- | ------- | ------ |
| 1    | 1       | iPhone  | 1      |
| 2    | 1       | iPad    | 2      |
| 3    | 2       | MacBook | 1      |

We want to sync both tables into one JSON file with the following shape:

```
{
    "users": [
        {
            "id": 1,
            "name": "Tom",
            "age": 20,
            "orders": [
                { "id": 1, "product": "iPhone", "amount": 1 },
                { "id": 2, "product": "iPad", "amount": 2 }
            ]
        },
        {
            "id": 2,
            "name": "Jack",
            "age": 22,
            "orders": [
                { "id": 3, "product": "MacBook", "amount": 1 }
            ]
        },
        {
            "id": 3,
            "name": "Mary",
            "age": 21,
            "orders": []
        }
    ]
}
```

Each user record carries its basic fields plus its order details; a user with no orders gets an empty orders list.

To implement this we can use two DataX plugins: the MySQL Reader plugin and a JSON Writer plugin.

First, configure the MySQL Reader plugin in the DataX job file so the data is read from the database with SQL. Example:

```
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/test",
                                "table": ["user", "order"],
                                "username": "root",
                                "password": "root"
                            }
                        ],
                        "column": [
                            "user.id", "user.name", "user.age",
                            "order.id", "order.product", "order.amount"
                        ],
                        "where": "",
                        "splitPk": "",
                        "encoding": "UTF-8"
                    }
                },
                "writer": null
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```

This configuration reads the user and order tables from MySQL and merges the columns of both tables so they can be processed together later.

Next, configure the JSON Writer plugin to write the data into the JSON file. Example:

```
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/test",
                                "table": ["user", "order"],
                                "username": "root",
                                "password": "root"
                            }
                        ],
                        "column": [
                            "user.id", "user.name", "user.age",
                            "order.id", "order.product", "order.amount"
                        ],
                        "where": "",
                        "splitPk": "",
                        "encoding": "UTF-8"
                    }
                },
                "writer": {
                    "name": "jsonwriter",
                    "parameter": {
                        "path": "/data/users.json",
                        "fileType": "NORMAL",
                        "compress": "NONE",
                        "encoding": "UTF-8",
                        "dateFormat": "yyyy-MM-dd HH:mm:ss",
                        "writeMode": "APPEND",
                        "column": [
                            { "name": "id", "type": "LONG" },
                            { "name": "name", "type": "STRING" },
                            { "name": "age", "type": "LONG" },
                            {
                                "name": "orders",
                                "type": "ARRAY",
                                "childColumn": [
                                    { "name": "id", "type": "LONG" },
                                    { "name": "product", "type": "STRING" },
                                    { "name": "amount", "type": "LONG" }
                                ]
                            }
                        ]
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```

Here the JSON Writer writes the data into a JSON file: the user's id, name and age become top-level fields, while the order's id, product and amount are stored as an array in the orders field.

Finally, launch the DataX job; the data is read from MySQL and written into the single JSON file.

This example is for reference only; adjust it to your actual requirements.
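One caveat on the example above: a JSON writer with nested ARRAY columns may not be shipped with your DataX build. As a fallback, the nesting can be done with a small post-processing script. Below is a minimal sketch, assuming (hypothetically) that the two tables were first exported as `users.csv` (columns id,name,age) and `orders.csv` (columns id,user_id,product,amount), e.g. via DataX's txtfilewriter:

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Minimal sketch (not the article's method): nest each user's orders under the
# user record. users.csv and orders.csv are hypothetical flat exports of the
# two tables shown above.
import csv
import json


def build_nested_json(users_csv, orders_csv, out_path):
    # index orders by user_id so each user lookup is O(1)
    orders_by_user = {}
    with open(orders_csv) as f:
        for row in csv.DictReader(f):
            orders_by_user.setdefault(row["user_id"], []).append({
                "id": int(row["id"]),
                "product": row["product"],
                "amount": int(row["amount"]),
            })
    users = []
    with open(users_csv) as f:
        for row in csv.DictReader(f):
            users.append({
                "id": int(row["id"]),
                "name": row["name"],
                "age": int(row["age"]),
                # users without orders get an empty list
                "orders": orders_by_user.get(row["id"], []),
            })
    with open(out_path, "w") as f:
        json.dump({"users": users}, f, indent=4)


if __name__ == '__main__':
    build_nested_json("users.csv", "orders.csv", "/data/users.json")
```

Running it produces the nested users JSON shown at the top of this example.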