#!/usr/bin/python
#-*- encoding:utf-8 -*-
import json
import os
import sys
import psycopg2
from clickhouse_driver import Client
from clickhouse_driver import connect
# Python 2 only: force the process-wide default string encoding to UTF-8 so
# the Chinese text in SQL aliases and comments round-trips.  `reload` is a
# builtin only in Python 2; guarding it lets the script at least start under
# Python 3 (where UTF-8 is already the default and setdefaultencoding is gone).
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Py2 only
    sys.setdefaultencoding('utf-8')
"""
Prerequisites:
    yum install postgresql-devel*
    yum install python-psycopg2   # python35-psycopg2 for py3
    pip install clickhouse-driver
What this script solves:
1. Mirror a PostgreSQL table's structure into ClickHouse, with a configurable
   table name and environment.
2. Generate, per environment, the JSON job file DataX needs to sync the data.
"""
# PostgreSQL connection settings, keyed by environment ('prod' / 'dev').
# The values are placeholders (数据库名 = database name, 用户 = user,
# 密码 = password, 主机IP = host IP) — fill in real credentials before use.
pg_dict = {
    'prod': {"database": "数据库名", "user": "用户", "password": "密码", "host": "主机IP", "port": "5432"},
    'dev': {"database": "数据库名", "user": "用户", "password": "密码", "host": "主机IP", "port": "5432"}
}
# ClickHouse connection settings, keyed by environment.  Port 8123 is the
# ClickHouse HTTP port (used for the DataX JDBC URL); note the native-protocol
# connection elsewhere in this file is hard-coded to port 9000.
ck_dict = {
    'prod': {"database": "数据库", "user": "用户", "password": "密码", "host": "主机IP", "port": "8123"},
    'dev': {"database": "数据库", "user": "用户", "password": "密码", "host": "主机IP", "port": "8123"}
}
class DataXHJL:
    """Mirror a PostgreSQL table into ClickHouse and drive DataX full syncs.

    Workflow: create_ck_table() rebuilds the ClickHouse table from the PG
    schema, generate_datax_json_file() writes the DataX job JSON, and
    datax_copy_data_to_ck() shells out to datax.py to copy the rows.
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    jdbcurl_dict = {"pg": "", "ck": ""}  # JDBC URLs placed into the DataX job
    cur_pg, conn_pg = None, None         # psycopg2 cursor / connection
    cur_ck, conn_ck = None, None         # clickhouse-driver cursor / connection
    env_name = 'dev'
    table_name_pg = ""    # source table name in PostgreSQL
    table_name_ck = ""    # target table name in ClickHouse
    datax_json_path = ""  # generated DataX job file, e.g. /usr/local/datax/bin/nt_member.json

    # Columns DataX chokes on (their names are upper-case; possibly a DataX
    # bug).  They are not business-critical, so the sync skips them:
    # nt_sale_shop.SMH_account; nt_product.SAP_description, SAP_code,
    # SAP_description2, SAP_code2
    _SKIP_COLUMNS = frozenset(["SMH_account", "SAP_description", "SAP_code", "SAP_description2", "SAP_code2"])

    def __init__(self, env_name, table_name_pg):
        """Open connections to PostgreSQL and ClickHouse for *env_name*.

        :param env_name: environment key, 'dev' or 'prod'
        :param table_name_pg: PostgreSQL table name; the ClickHouse name is
            derived by replacing a leading 'nt_' prefix with 'ck_'
        """
        self.env_name = env_name
        self.table_name_pg = table_name_pg
        self.table_name_ck = table_name_pg.replace("nt_", "ck_", 1)
        pg_conf = pg_dict.get(self.env_name, {})
        ck_conf = ck_dict.get(self.env_name, {})
        # BUGFIX: build a fresh per-instance dict instead of calling .update()
        # on the shared class attribute, which leaked URLs between instances.
        self.jdbcurl_dict = {
            "pg": "jdbc:postgresql://%s:%s/%s" % (
                pg_conf.get('host'), pg_conf.get('port'), pg_conf.get('database')),
            "ck": "jdbc:clickhouse://%s:%s/%s" % (
                ck_conf.get('host'), ck_conf.get('port'), ck_conf.get('database')),
        }
        self.conn_pg = psycopg2.connect(database=pg_conf.get('database'),
                                        user=pg_conf.get('user'),
                                        password=pg_conf.get('password'),
                                        host=pg_conf.get('host'),
                                        port=pg_conf.get('port'))
        self.cur_pg = self.conn_pg.cursor()
        # Native-protocol URL shape: user:password@host:port/database.
        # NOTE(review): user 'default' and port 9000 are hard-coded here while
        # ck_dict carries a 'user' and HTTP port 8123 — confirm this is intended.
        self.conn_ck = connect('clickhouse://default:%s@%s:9000/%s' % (
            ck_conf.get('password'),
            ck_conf.get('host'),
            ck_conf.get('database')
        ))
        self.cur_ck = self.conn_ck.cursor()

    def close(self):
        """Close both cursors, then both connections."""
        self.cur_pg.close()
        self.conn_pg.close()
        self.cur_ck.close()
        self.conn_ck.close()

    def get_pg_table_columns(self):
        """Return the PG table's column names in ordinal order, minus the
        DataX-incompatible columns in _SKIP_COLUMNS."""
        # Bound parameter instead of %-interpolation: avoids SQL injection
        # and quoting problems with the table name.
        self.cur_pg.execute(
            "SELECT column_name from information_schema.columns "
            "where table_name = %s and table_schema = 'public' "
            "order by ordinal_position ASC",
            (self.table_name_pg,))
        return [row[0] for row in self.cur_pg.fetchall()
                if row[0] not in self._SKIP_COLUMNS]

    def get_pg_table_columns_comment(self):
        """Return {column_name: {"comment": ..., "type": ...}} for the PG table.

        Missing comments/types are normalized to empty strings.
        """
        column_dict = {}
        # Raw string so the POSIX regex '\(.*\)' reaches PostgreSQL with its
        # backslashes intact (non-raw '\(' is an invalid Python escape).  The
        # table name is passed as a bound parameter, not %-interpolated.
        self.cur_pg.execute(r"""
            SELECT
                a.attname as "字段名",
                col_description(a.attrelid,a.attnum) as "注释",
                concat_ws('',t.typname,SUBSTRING(format_type(a.atttypid,a.atttypmod) from '\(.*\)')) as "字段类型"
            FROM
                pg_class as c,
                pg_attribute as a,
                pg_type as t
            WHERE
                c.relname = %s
                and a.atttypid = t.oid
                and a.attrelid = c.oid
                and a.attnum>0;
        """, (self.table_name_pg,))
        for name, comment, col_type in self.cur_pg.fetchall():
            column_dict[name] = {"comment": comment or "", "type": col_type or ""}
        return column_dict

    def create_ck_table(self):
        """Recreate the ClickHouse table mirroring the PostgreSQL definition.

        Steps: build the DDL inside PostgreSQL, DROP any existing target
        table, CREATE the new one, then copy each column comment over.
        A PG type with no mapping surfaces as '###################<type>' in
        the DDL, so the CREATE fails loudly rather than silently mis-typing.
        NOTE(review): the generated DDL assumes the table has 'create_date'
        (partition key) and 'id' (order key) columns — confirm for new tables.
        """
        # 1. Build the ClickHouse DDL from information_schema.  Both the target
        #    name (spliced into the DDL text) and the source table name are
        #    bound parameters.
        self.cur_pg.execute("""
            select
                concat(
                    'create table ',
                    %s,
                    '(',
                    'ts_ms Int64 DEFAULT now(), insert_time DateTime DEFAULT now(), ',
                    string_agg(
                        concat(
                            column_name,
                            ' ',
                            CASE when is_nullable = 'YES' and column_name != 'create_date' THEN 'Nullable(' END,
                            CASE
                                when data_type = 'integer' then 'Int32'
                                when data_type = 'smallint' then 'Int16'
                                when data_type = 'bigint' then 'Int64'
                                when data_type = 'numeric' then 'Float64'
                                when data_type = 'double precision' then 'Float64'
                                when data_type = 'character varying' then 'String'
                                when data_type = 'character' then 'String'
                                when data_type = 'text' then 'String'
                                when data_type = 'boolean' then 'UInt8'
                                when data_type = 'date' then 'Date'
                                when data_type = 'timestamp without time zone' then 'Datetime'
                                when data_type = 'time without time zone' then 'String'
                                else concat('###################', data_type)
                            END,
                            CASE when is_nullable = 'YES' and column_name != 'create_date' THEN ')' END
                        ),
                        ', ' order by ordinal_position
                    ),
                    ') ',
                    'ENGINE = ReplacingMergeTree() '
                    'PARTITION BY toYYYYMM(create_date) '
                    'ORDER BY [id] '
                ) as table_ddl
            from information_schema.columns
            where true
                and table_name = %s
                and table_schema = 'public'
            group by table_name ;
        """, (self.table_name_ck, self.table_name_pg))
        ck_table_create_sql = self.cur_pg.fetchone()[0]
        print("1. 生成clickhouse创建表的语句 -- ok")
        # 2. Drop a pre-existing target table, if any.
        self.cur_ck.execute("DROP TABLE IF EXISTS %s" % self.table_name_ck)
        print("2. clickhouse中删除已存在的表 -- ok")
        # 3. Create the new table.
        self.cur_ck.execute(ck_table_create_sql)
        print("3. clickhouse中创建表 -- ok")
        # 4. Copy the column comments over.  Backslashes and single quotes in
        #    a comment are escaped so they cannot break the ALTER statement.
        for column_name, meta in self.get_pg_table_columns_comment().items():
            comment = meta.get('comment', '').replace("\\", "\\\\").replace("'", "\\'")
            self.cur_ck.execute("ALTER TABLE `%s` COMMENT COLUMN `%s` '%s';" % (
                self.table_name_ck, column_name, comment))
        print("4. clickhouse中修改字段备注 -- ok")

    def generate_datax_json_file(self, querySql=None):
        """Write the DataX job JSON for a full PG -> CK copy.

        :param querySql: when None, the whole PG source table (minus skipped
            columns) is the data source; otherwise this SQL query is used as
            the source instead.
        Side effects: writes ./<ck_table>_<env>.json and remembers the path
        in self.datax_json_path.
        """
        # Job skeleton.  The reader/writer connection values below are
        # placeholders from the original example (nt_member_account); both
        # branches overwrite them before the file is written.
        json_dict = {
            "job": {
                "content": [{
                    "reader": {
                        "name": "postgresqlreader",
                        "parameter": {
                            "column": ["id", "create_uid", "create_date", "write_date", "write_uid", "account", "account_type", "wx_app_id", "applet_openid", "member_id"],
                            "connection": [{
                                "jdbcUrl": ["jdbc:postgresql://10.200.100.1:5432/hh_test"],
                                "table": ["nt_member_account"]
                            }],
                            "username": pg_dict.get(self.env_name, {}).get('user'),
                            "password": pg_dict.get(self.env_name, {}).get('password')
                        }
                    },
                    "writer": {
                        "name": "clickhousewriter",
                        "parameter": {
                            "username": ck_dict.get(self.env_name, {}).get("user"),
                            "password": ck_dict.get(self.env_name, {}).get("password"),
                            "connection": [{
                                "jdbcUrl": "jdbc:clickhouse://127.0.0.1:8123/hh_test",
                                "table": ["ck_member_account"]
                            }]
                        }
                    }
                }],
                "setting": {
                    "speed": {
                        "channel": 1
                    }
                }
            }
        }
        reader_param = json_dict['job']['content'][0]['reader']['parameter']
        writer_param = json_dict['job']['content'][0]['writer']['parameter']
        if querySql:
            # Arbitrary query as source.  NOTE(review): the placeholder
            # "column" list is left in the reader — presumably DataX ignores
            # it when querySql is set; confirm against the DataX docs.
            reader_param.update({
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "querySql": [querySql]
                }]
            })
            writer_param.update({
                "column": ["*"],
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                # Full reload: clear the target table before writing.
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                ]
            })
        else:
            # Whole-table copy with an explicit, identical column list on
            # both sides.
            columns = self.get_pg_table_columns()
            reader_param.update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": [self.jdbcurl_dict.get('pg')],
                    "table": [self.table_name_pg]
                }]
            })
            writer_param.update({
                "column": columns,
                "connection": [{
                    "jdbcUrl": self.jdbcurl_dict.get('ck'),
                    "table": [self.table_name_ck]
                }],
                # Full reload, plus rebuild the de-duplicating view: per id it
                # keeps only the newest row by insert_time (ReplacingMergeTree
                # may hold duplicates until its background merge runs).
                "preSql": [
                    "TRUNCATE TABLE %s" % self.table_name_ck,
                    "drop view if EXISTS `view_%s`" % self.table_name_ck,
                    "create view `view_%s` as select * from (select row_number() OVER www as row, * from %s WINDOW www as (partition by id order by insert_time desc) ) where row = 1" % (self.table_name_ck, self.table_name_ck)
                ]
            })
        self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        self.save_file(path=self.datax_json_path, item=json_dict)
        print("1. 生成 datax 需要的json文件: %s -- ok " % self.datax_json_path)

    def datax_copy_data_to_ck(self):
        """Run DataX (/usr/local/datax/bin/datax.py) on the generated job file.

        Falls back to the conventional "./<ck_table>_<env>.json" path when
        generate_datax_json_file() was not called in this process.
        """
        print("datax执行数据全量同步操作 ...")
        if not self.datax_json_path:
            self.datax_json_path = "./%s_%s.json" % (self.table_name_ck, self.env_name)
        # NOTE(review): the path is built from trusted CLI input; switch to
        # subprocess.run([...]) if that ever changes.
        os.system("python /usr/local/datax/bin/datax.py %s" % self.datax_json_path)

    def save_file(self, path, item):
        """Serialize dict *item* as pretty-printed JSON into *path*.

        Mode "w" already truncates/creates the file, so no separate delete is
        needed.  Errors are printed, not raised (original best-effort kept).
        """
        text = json.dumps(item, indent=4)
        try:
            with open(path, "w") as f:
                f.write(text)
            print("创建 datax 同步数据 需要的json文件 -- ok")
        except Exception as e:
            print("write error==>", e)
if __name__ == '__main__':
    help_doc = """
使用方法:
1. 将PG表结构复制到CK中 python datax_hh.py create_ck_table <环境> <PG表名>
2. 生成datax需要的json文件 python datax_hh.py generate_datax_json_file <环境> <PG表名>
3. 调用datax.py将全量数据同步到CK中 python datax_hh.py datax_copy_data_to_ck <环境> <PG表名>
4. 执行以上合并操作 python datax_hh.py all <环境> <PG表名>
5. 导入会员分组涉及的4张表 python datax_hh.py ck_member_group <环境> <*表名看以下说明*>
示例: 将测试环境pg指定表导入ck测试库中: python datax_hh.py all dev nt_payment_type
示例: 将正式环境pg指定表导入ck正式库中: python datax_hh.py all prod nt_payment_type
"""
    # Member grouping uses these 4 wide tables (from the original notes):
    #   python datax_hjl.py ck_member_group prod ck_member
    #   python datax_hjl.py all dev nt_sale_order
    #   python datax_hjl.py all dev nt_sale_order_line
    if "help" in sys.argv or "-h" in sys.argv or len(sys.argv) == 1:
        print(help_doc)
        sys.exit()
    # BUGFIX: require all three positional args; the original indexed
    # sys.argv[-3] unconditionally, so short calls silently grabbed the
    # script name as the action.  Also renamed 'type' (shadowed builtin).
    if len(sys.argv) < 4:
        print(help_doc)
        sys.exit()
    action, env_name, table_name = sys.argv[-3], sys.argv[-2], sys.argv[-1]
    if env_name not in ['dev', 'prod']:
        print("指定的环境错误 可选值 dev , prod")
        sys.exit()
    if table_name == "nt_member":
        print("会员表在clickhouse中是定制过的大宽表, 不能这么处理")
        sys.exit()
    job = DataXHJL(env_name=env_name, table_name_pg=table_name)
    # NOTE(review): the help text advertises a 'ck_member_group' action, but
    # no branch handles it — confirm whether it was dropped on purpose.
    if action == "create_ck_table":
        job.create_ck_table()
    elif action == "generate_datax_json_file":
        job.generate_datax_json_file()
    elif action == "datax_copy_data_to_ck":
        job.datax_copy_data_to_ck()
    elif action == "all":
        job.create_ck_table()
        job.generate_datax_json_file()
        job.datax_copy_data_to_ck()
    job.close()
# --- blog-scrape residue (not part of the script; kept as a comment so the
# file stays valid Python) ---
# "python 生成 datax 同步数据需要的 json 格式文件"
# 最新推荐文章于 2024-08-10 13:32:35 发布