Generating the JSON job files DataX needs for data synchronization, with Python

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import os
import sys

import psycopg2
from clickhouse_driver import connect

if sys.version_info[0] == 2:  # the reload/setdefaultencoding hack is Python 2 only
    reload(sys)
    sys.setdefaultencoding('utf-8')

"""
yum install postgresql-devel*
yum install python-psycopg2  # python35-psycopg2 for py3
pip install clickhouse_drivers

脚本文件 解决问题:
  1. 自定义表名,自定义环境 可以将PG表结构 同步到CK中
  2. 支持按不同环境, 自动创建 datax 同步数据需要的 json 文件
"""

# postgres connection settings
pg_dict = {
    'prod': {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "5432"},
    'dev':  {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "5432"}
}

# clickhouse connection settings
# NOTE: 8123 is the HTTP port used in the JDBC URL; the native clickhouse_driver connection below uses 9000
ck_dict = {
    'prod': {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "8123"},
    'dev':  {"database": "dbname", "user": "user", "password": "password", "host": "host IP", "port": "8123"}
}


class DataXHJL:
    # NOTE: these are class-level attributes; jdbcurl_dict in particular is mutated
    # in place in __init__, so it is shared across instances
    jdbcurl_dict = {"pg": "", "ck": ""}  # JDBC URLs
    cur_pg, conn_pg = None, None
    cur_ck, conn_ck = None, None
    env_name = 'dev'
    table_name_pg = ""  # name of the postgres table to operate on
    table_name_ck = ""  # corresponding clickhouse table name
    datax_json_path = ""  # path of the json file DataX uses for the sync, e.g. /usr/local/datax/bin/nt_member.json

    def __init__(self, env_name, table_name_pg):
        """
        连接 postgres , clickhouse DB
        """
        self.env_name = env_name
        self.table_name_pg = table_name_pg
        self.table_name_ck = table_name_pg.replace("nt_", "ck_", 1)
        # jdbcurl 配置
        self.jdbcurl_dict.update({
            # "pg": "jdbc:postgresql://xxxx:5432/hh_test",
            # "ck": "jdbc:clickhouse://127.0.0.1:8123/hh_test",
            "pg": "jdbc:postgresql://%s:%s/%s" % (
                pg_dict.get(self.env_name, {}).get('host'),
                pg_dict.get(self.env_name, {}).get('port'),
                pg_dict.get(self.env_name, {}).get('database')
            ),
            "ck": "jdbc:clickhouse://%s:%s/%s" % (
                ck_dict.get(self.env_name, {}).get('host'),
                ck_dict.get(self.env_name, {}).get('port'),
                ck_dict.get(self.env_name, {}).get('database')
            ),
        })
        # postgres connection (see https://www.cnblogs.com/hello-wei/p/9685998.html)
        self.conn_pg = psycopg2.connect(database=pg_dict.get(self.env_name, {}).get('database'),
                                        user=pg_dict.get(self.env_name, {}).get('user'),
                                        password=pg_dict.get(self.env_name, {}).get('password'),
                                        host=pg_dict.get(self.env_name, {}).get('host'),
                                        port=pg_dict.get(self.env_name, {}).get('port'))
        # cursor used to run statements against postgres
        self.cur_pg = self.conn_pg.cursor()
        # smoke test, e.g.:
        #   self.cur_pg.execute("CREATE TABLE test_conn(id int, name text)")
        #   self.cur_pg.execute("INSERT INTO test_conn values(1,'haha')")
        #   self.conn_pg.commit()
        # clickhouse native connection: user:password@host:port/database
        # e.g. connect('clickhouse://default:123456@10.10.10.1:9000/hh_test')
        # NOTE: the native driver speaks on port 9000; the HTTP port 8123 in ck_dict is only for the JDBC URL
        self.conn_ck = connect('clickhouse://%s:%s@%s:9000/%s' % (
            ck_dict.get(self.env_name, {}).get('user'),
            ck_dict.get(self.env_name, {}).get('password'),
            ck_dict.get(self.env_name, {}).get('host'),
            ck_dict.get(self.env_name, {}).get('database')
        ))
        self.cur_ck = self.conn_ck.cursor()
        # smoke test, e.g.:
        #   self.cur_ck.execute('show databases')
        #   print(self.cur_ck.fetchall())

    def close(self):
        self.cur_pg.close()   # close postgres cursor
        self.conn_pg.close()  # close postgres connection
        self.cur_ck.close()   # close clickhouse cursor
        self.conn_ck.close()  # close clickhouse connection

    def get_pg_table_columns(self):
        """
        将 postgres 表中字段 使用 "", 连接在一起
        """
        columns = []
        hql = "SELECT column_name from information_schema.columns where table_name = '%s' and table_schema = 'public' order by ordinal_position ASC" % self.table_name_pg
        self.cur_pg.execute(hql)
        rows = self.cur_pg.fetchall()
        for row in rows:
            # TODO: 字段名称是大写的, datax 执行同步时 报错(可能是datax的BUG); 这些字段不影响业务, 做跳过处理
            # nt_sale_shop.SMH_account   nt_product.SAP_description,SAP_code,SAP_description2,SAP_code2
            if row[0] in ["SMH_account", "SAP_description", "SAP_code", "SAP_description2", "SAP_code2"]:
                continue
            columns.append(row[0])
        return columns

    def get_pg_table_columns_comment(self):
        """
        将 postgres 表中字段 使用 "", 连接在一起
        """
        column_dict = {}
        self.cur_pg.execute("""
            SELECT 
                a.attname as "字段名",
                col_description(a.attrelid,a.attnum) as "注释",
                concat_ws('',t.typname,SUBSTRING(format_type(a.atttypid,a.atttypmod) from '\(.*\)')) as "字段类型"
            FROM 
                pg_class as c,
                pg_attribute as a, 
                pg_type as t
            WHERE 
                c.relname = '%s' 
                and a.atttypid = t.oid 
                and a.attrelid = c.oid 
                and a.attnum>0;
        """ % self.table_name_pg)
        rows = self.cur_pg.fetchall()
        for row in rows:
            column_dict[row[0]] = {"comment": row[1] or "", "type": row[2] or ""}
        return column_dict

    def create_ck_table(self):
        # 1. build the clickhouse CREATE TABLE statement from the postgres catalog
        self.cur_pg.execute("""
            select
            concat(
                'create table ',
                '%s',
                '(',
                'ts_ms Int64 DEFAULT now(), insert_time DateTime DEFAULT now(), ',
                string_agg(
                        concat(
                                column_name,
                                ' ',
                                CASE when is_nullable = 'YES' and column_name != 'create_date' THEN 'Nullable(' END,
                                CASE
                                    when data_type = 'integer'  then 'Int32'
                                    when data_type = 'smallint' then 'Int16'
                                    when data_type = 'bigint'   then 'Int64'
                                    when data_type = 'numeric'  then 'Float64'
                                    when data_type = 'double precision'  then 'Float64'
                                    when data_type = 'character varying' then 'String'
                                    when data_type = 'character' then 'String'
                                    when data_type = 'text'      then 'String'
                                    when data_type = 'boolean'   then 'UInt8'
                                    when data_type = 'date'      then 'Date'
                                    when data_type = 'timestamp without time zone' then 'Datetime'
                                    when data_type = 'time without time zone'      then 'String'
                                    else concat('###################', data_type)
                                    END,
                                CASE when is_nullable = 'YES' and column_name != 'create_date' THEN ')' END
                            ),
                        ', ' order by ordinal_position
                ),
                ') ',
                'ENGINE = ReplacingMergeTree() '
                'PARTITION BY toYYYYMM(create_date) '
                'ORDER BY (id) '
            ) as table_ddl
        from information_schema.columns
        where true
          and table_name = '%s'
          and table_schema = 'public'
        group by table_name ;
        """ % (self.table_name_ck, self.table_name_pg))
        rows = self.cur_pg.fetchone()
        ck_table_create_sql = rows[0]
        print("1. build the clickhouse CREATE TABLE statement -- ok")
        # print(rows[0])
        # 2. drop the table in clickhouse if it already exists
        self.cur_ck.execute("DROP TABLE IF EXISTS %s" % self.table_name_ck)
        print("2. drop existing clickhouse table -- ok")
        # 3. create the table in clickhouse
        self.cur_ck.execute(ck_table_create_sql)
        print("3. create clickhouse table -- ok")
        # 4. copy the column comments over to clickhouse
        column_dict = self.get_pg_table_columns_comment()
        for column_name, _item in column_dict.items():
            self.cur_ck.execute("ALTER TABLE `%s` COMMENT COLUMN `%s` '%s';" % (self.table_name_ck, column_name, _item.get('comment', '')))
        print("4. set clickhouse column comments -- ok")

    def generate_datax_json_file(self, querySql=None):
        """
        @param querySql 没有值 表示 根据PG原表 做为数据源
                        有值   表示 根据PG查询的SQL 作为数据源
        """
        # 1. build the json dict DataX needs; the hardcoded reader/writer values in this
        #    template are placeholders and are overwritten below
        json_dict = {
                    "job": {
                        "content": [{
                            "reader": {
                                "name": "postgresqlreader",
                                "parameter": {
                                    "column": ["id", "create_uid", "create_date", "write_date", "write_uid", "account", "account_type", "wx_app_id", "applet_openid", "member_id"],
                                    "connection": [{
                                        "jdbcUrl": ["jdbc:postgresql://10.200.100.1:5432/hh_test"],
                                        "table": ["nt_member_account"]
                                    }],
                                    "username": pg_dict.get(self.env_name, {}).get('user'),
                                    "password": pg_dict.get(self.env_name, {}).get('password')
                                }
                            },
                            "writer": {
                                "name": "clickhousewriter",
                                "parameter": {
                                    "username": ck_dict.get(self.env_name, {}).get("user"),
                                    "password": ck_dict.get(self.env_name, {}).get("password"),
                                    #"column": ["id", "create_uid", "create_date", "write_date", "write_uid", "account", "account_type", "wx_app_id", "applet_openid", "member_id"],
                                    "connection": [{
                                        "jdbcUrl": "jdbc:clickhouse://127.0.0.1:8123/hh_test",
                                        "table": ["ck_member_account"]
                                    }],
                                    # "preSql": [
                                    #     "TRUNCATE TABLE ck_member_account",
                                    #     "drop view if EXISTS `view_ck_member_account`",
                                    #     "create view `view_ck_member_account` as select * from (select row_number() OVER www as row, * from ck_member_account WINDOW www as (partition by id order by insert_time desc) ) where row = 1"
                                    # ]
                                }
                            }
                        }],
                        "setting": {
                            "speed": {
                                "channel": 1
                            }
                        }
                    }
                }
        if querySql:
            # postgres (reader) settings
            json_dict['job']['content'][0]['reader']['parameter'].update({
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "querySql": [querySql]
                }]
            })
            # clickhouse (writer) settings
            json_dict['job']['content'][0]['writer']['parameter'].update({
                "column": ["*"],
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                ]
            })
        else:
            columns = self.get_pg_table_columns()
            # print(columns)
            # replace the placeholder settings in the template
            # postgres (reader) settings
            json_dict['job']['content'][0]['reader']['parameter'].update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "table": [self.table_name_pg]
                }]
            })
            # clickhouse (writer) settings
            json_dict['job']['content'][0]['writer']['parameter'].update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                    "drop view if EXISTS `view_%s`" % self.table_name_ck,
                    "create view `view_%s` as select * from (select row_number() OVER www as row, * from %s WINDOW www as (partition by id order by insert_time desc) ) where row = 1" % (self.table_name_ck, self.table_name_ck)
                ]
            })

        self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        self.save_file(path=self.datax_json_path, item=json_dict)
        print("1. 生成 datax 需要的json文件: %s -- ok " % self.datax_json_path)

    def datax_copy_data_to_ck(self):
        """
        # 执行 datax 复制数据
        # /usr/local/datax/bin/datax.py
        """
        print("datax执行数据全量同步操作 ...")
        if not self.datax_json_path:
            self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        os.system("python /usr/local/datax/bin/datax.py %s" % self.datax_json_path)

    def save_file(self, path, item):
        # serialize the dict to a writable json string
        item = json.dumps(item, indent=4)
        try:
            if os.path.exists(path):
                os.remove(path)
            with open(path, "w") as f:
                f.write(item)
                print("wrote the json file DataX needs for the sync -- ok")
        except Exception as e:
            print("write error==>", e)


if __name__ == '__main__':
    help_doc = """
    使用方法:
    1. 将PG表结构复制到CK中             python datax_hh.py create_ck_table <环境> <PG表名>
    2. 生成datax需要的json文件          python datax_hh.py generate_datax_json_file <环境> <PG表名> 
    3. 调用datax.py将全量数据同步到CK中  python datax_hh.py datax_copy_data_to_ck <环境> <PG表名> 
    4. 执行以上合并操作                 python datax_hh.py all  <环境> <PG表名>
    5. 导入会员分组涉及的4张表           python datax_hh.py ck_member_group <环境> <*表名看以下说明*>
    
    示例: 将测试环境pg指定表导入ck测试库中:  python datax_hh.py all dev  nt_payment_type
    示例: 将正式环境pg指定表导入ck正式库中:  python datax_hh.py all prod nt_payment_type
    
    """
    """
    # 会员分组: 使用到的4张大宽表
    python datax_hjl.py ck_member_group prod ck_member 
    python datax_hjl.py all dev nt_sale_order
    python datax_hjl.py all dev nt_sale_order_line
    """
    if "help" in sys.argv or "-h" in sys.argv or len(sys.argv) == 1:
        print(help_doc)
        sys.exit()

    action, env_name, table_name = sys.argv[-3], sys.argv[-2], sys.argv[-1]
    if env_name not in ['dev', 'prod']:
        print("invalid environment; valid values: dev, prod")
        sys.exit()

    if table_name == "nt_member":
        print("the member table is a customized wide table in clickhouse and cannot be handled this way")
        sys.exit()

    # run the requested action
    cls = DataXHJL(env_name=env_name, table_name_pg=table_name)

    if "create_ck_table" in sys.argv:
        cls.create_ck_table()
    elif "generate_datax_json_file" in sys.argv:
        cls.generate_datax_json_file()
    elif "datax_copy_data_to_ck" in sys.argv:
        cls.datax_copy_data_to_ck()
    elif "all" in sys.argv:
        cls.create_ck_table()
        cls.generate_datax_json_file()
        cls.datax_copy_data_to_ck()
    cls.close()
```
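For reference, a full-table run such as `python datax_hjl.py generate_datax_json_file dev nt_payment_type` writes a file like the sketch below (here `./ck_payment_type_dev.json`). The host names, credentials and the three-column table are illustrative placeholders, not values from a real environment:

```
{
    "job": {
        "content": [{
            "reader": {
                "name": "postgresqlreader",
                "parameter": {
                    "column": ["id", "create_date", "name"],
                    "connection": [{
                        "jdbcUrl": ["jdbc:postgresql://host:5432/dbname"],
                        "table": ["nt_payment_type"]
                    }],
                    "username": "user",
                    "password": "password"
                }
            },
            "writer": {
                "name": "clickhousewriter",
                "parameter": {
                    "username": "user",
                    "password": "password",
                    "column": ["id", "create_date", "name"],
                    "connection": [{
                        "jdbcUrl": "jdbc:clickhouse://host:8123/dbname",
                        "table": ["ck_payment_type"]
                    }],
                    "preSql": [
                        "TRUNCATE TABLE ck_payment_type",
                        "drop view if EXISTS `view_ck_payment_type`",
                        "create view `view_ck_payment_type` as select * from (select row_number() OVER www as row, * from ck_payment_type WINDOW www as (partition by id order by insert_time desc) ) where row = 1"
                    ]
                }
            }
        }],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```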

Below is a simple example showing how DataX can sync data from multiple tables into a single JSON file.

Suppose we have two tables, a user table (user) and an order table (order), with the following contents:

user table:

| id   | name | age  |
| ---- | ---- | ---- |
| 1    | Tom  | 20   |
| 2    | Jack | 22   |
| 3    | Mary | 21   |

order table:

| id   | user_id | product | amount |
| ---- | ------- | ------- | ------ |
| 1    | 1       | iPhone  | 1      |
| 2    | 1       | iPad    | 2      |
| 3    | 2       | MacBook | 1      |

We want to sync both tables into one JSON file with the following shape:

```
{
    "users": [
        {
            "id": 1,
            "name": "Tom",
            "age": 20,
            "orders": [
                { "id": 1, "product": "iPhone", "amount": 1 },
                { "id": 2, "product": "iPad", "amount": 2 }
            ]
        },
        {
            "id": 2,
            "name": "Jack",
            "age": 22,
            "orders": [
                { "id": 3, "product": "MacBook", "amount": 1 }
            ]
        },
        {
            "id": 3,
            "name": "Mary",
            "age": 21,
            "orders": []
        }
    ]
}
```

Each user record carries its basic fields plus its order details; a user with no orders gets an empty orders list.

To implement this we can use two DataX plugins: the MySQL Reader plugin and a JSON Writer plugin.

First, configure the MySQL Reader plugin in the DataX job file so the data is read from the database with SQL. Example:

```
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/test",
                                "table": ["user", "order"],
                                "username": "root",
                                "password": "root"
                            }
                        ],
                        "column": [
                            "user.id", "user.name", "user.age",
                            "order.id", "order.product", "order.amount"
                        ],
                        "where": "",
                        "splitPk": "",
                        "encoding": "UTF-8"
                    }
                },
                "writer": null
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```

This configuration reads the user and order tables from MySQL and merges the columns of both tables so they can be processed together later.

Next, configure the JSON Writer plugin to write the data into the JSON file. Example:

```
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/test",
                                "table": ["user", "order"],
                                "username": "root",
                                "password": "root"
                            }
                        ],
                        "column": [
                            "user.id", "user.name", "user.age",
                            "order.id", "order.product", "order.amount"
                        ],
                        "where": "",
                        "splitPk": "",
                        "encoding": "UTF-8"
                    }
                },
                "writer": {
                    "name": "jsonwriter",
                    "parameter": {
                        "path": "/data/users.json",
                        "fileType": "NORMAL",
                        "compress": "NONE",
                        "encoding": "UTF-8",
                        "dateFormat": "yyyy-MM-dd HH:mm:ss",
                        "writeMode": "APPEND",
                        "column": [
                            { "name": "id", "type": "LONG" },
                            { "name": "name", "type": "STRING" },
                            { "name": "age", "type": "LONG" },
                            {
                                "name": "orders",
                                "type": "ARRAY",
                                "childColumn": [
                                    { "name": "id", "type": "LONG" },
                                    { "name": "product", "type": "STRING" },
                                    { "name": "amount", "type": "LONG" }
                                ]
                            }
                        ]
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
```

Here the JSON Writer writes the data into a JSON file: the user's id, name and age become top-level fields, while the order's id, product and amount are stored as an array in the orders field.

Finally, launch the DataX job; the data is read from MySQL and written into the single JSON file.

This example is for reference only; adjust it to your actual requirements.
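One caveat on the example above: a JSON writer with nested ARRAY columns may not be shipped with your DataX build. As a fallback, the nesting can be done with a small post-processing script. Below is a minimal sketch, assuming (hypothetically) that the two tables were first exported as `users.csv` (columns id,name,age) and `orders.csv` (columns id,user_id,product,amount), e.g. via DataX's txtfilewriter:

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Minimal sketch (not the article's method): nest each user's orders under the
# user record. users.csv and orders.csv are hypothetical flat exports of the
# two tables shown above.
import csv
import json


def build_nested_json(users_csv, orders_csv, out_path):
    # index orders by user_id so each user lookup is O(1)
    orders_by_user = {}
    with open(orders_csv) as f:
        for row in csv.DictReader(f):
            orders_by_user.setdefault(row["user_id"], []).append({
                "id": int(row["id"]),
                "product": row["product"],
                "amount": int(row["amount"]),
            })
    users = []
    with open(users_csv) as f:
        for row in csv.DictReader(f):
            users.append({
                "id": int(row["id"]),
                "name": row["name"],
                "age": int(row["age"]),
                # users without orders get an empty list
                "orders": orders_by_user.get(row["id"], []),
            })
    with open(out_path, "w") as f:
        json.dump({"users": users}, f, indent=4)


if __name__ == '__main__':
    build_nested_json("users.csv", "orders.csv", "/data/users.json")
```

Running it produces the nested users JSON shown at the top of this example.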