Trino&DBT不同类型数据库数据清洗统计
流程图
完整的数仓原型流程图
环境搭建
采用docker安装,还是一如既往使用我最喜欢的docker-compose;
Clickhouse和MySQL的安装中规中矩,也不考虑什么架构,直接拉取镜像打包即可
主要是:
1、dbt需要提前制作好镜像,参考:https://blog.csdn.net/huxiang19851114/article/details/126626213
2、trino需要提前配置好环境变量,参考:https://blog.csdn.net/huxiang19851114/article/details/125926092
PS:如果不想自己动手,可以直接下载我上传的文件目录:https://download.csdn.net/download/huxiang19851114/87570325,下载完后解压,除了dbt需要执行docker build生成镜像外,其他都不要改了:
docker build --tag dbt:v1.0.0 .
Docker Compose
version: '3.6'
services:
clickhouse:
image: yandex/clickhouse-server:20.8.7.15
container_name: clickhouse
ports:
- "8123:8123"
- "9000:9000"
mysql:
container_name: mysql
image: mysql:5.6
ports:
- 3306:3306
environment:
- MYSQL_ROOT_PASSWORD=root
- MYSQL_USER=huxiang
- MYSQL_PASSWORD=huxiang
volumes:
- /usr/local/dbt-trino-mysql-clickhouse/mysql/db:/var/lib/mysql
- /usr/local/dbt-trino-mysql-clickhouse/mysql/conf:/etc/mysql/mysql.conf.d
dbt:
image: dbt:v1.0.0
container_name: dbt
volumes:
- /usr/local/dbt-trino-mysql-clickhouse/dbt/app:/usr/app
- /usr/local/dbt-trino-mysql-clickhouse/dbt/dbt-profiles:/root/.dbt
tty: true
stdin_open: true
trino:
image: trinodb/trino:359
container_name: trino
volumes:
- /usr/local/dbt-trino-mysql-clickhouse/trino/etc/:/etc/trino/
ports:
- 10080:10080
tty: true
stdin_open: true
Trino配置Connector
这里我们需要配置两个Connector,分别指向源数据库的读(MySQL),目标数据库的写(ClickHouse)
#/usr/local/dbt-trino-mysql-clickhouse/trino/etc/catalog/source_dw_mysql.properties
connector.name=mysql
connection-url=jdbc:mysql://192.168.137.128:3306?enabledTLSProtocols=TLSv1.2&useSSL=false&serverTimezone=GMT%2B8
connection-user=root
connection-password=root
#/usr/local/dbt-trino-mysql-clickhouse/trino/etc/catalog/target_dw_clickhouse.properties,记得添加删除临时表权限
connector.name=clickhouse
connection-url=jdbc:clickhouse://192.168.137.128:8123
connection-user=default
connection-password=
allow-drop-table=true
#不配该项会报Cannot apply operator: varchar = varbinary,二进制字符串与字符串类型不能进行等值操作
clickhouse.map-string-as-varchar=true
重启Trino容器后能看到如下数据源connector,说明配置连接成功:
DBT配置
DBT对应的数据连接为trino,只不过各自指向source_dw_mysql和target_dw_clickhouse
配置工程加载文件
#vim /usr/local/dbt-trino-mysql-clickhouse/dbt/app/dbt_project/dbt_project.yml
#如果该文件已自动,则增加如下配置即可,作为源库:全局数据库服务和库名变量:
vars:
matomo_catalog: source_dw_matomo_mysql
matomo_schema: matomo
ngbilling_catalog: source_dw_ngbilling_mysql
ngbilling_schema: ngbilling
----------------------------------------------------------------------------
# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'dbt_project'
version: '1.0.0'
config-version: 2
# This setting configures which "profile" dbt uses for this project.
profile: 'dbt_project'
# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models/example"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]
target-path: "target" # directory which will store compiled SQL files
clean-targets: # directories to be removed by `dbt clean`
- "target"
- "dbt_packages"
vars:
matomo_catalog: source_dw_matomo_mysql
matomo_schema: matomo
ngbilling_catalog: source_dw_ngbilling_mysql
ngbilling_schema: ngbilling
# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models
# In this example config, we tell dbt to build all models in the example/ directory
# as tables. These settings can be overridden in the individual model files
# using the `{{ config(...) }}` macro.
models:
# Config indicated by + and applies to all files under models/example/
example:
+materialized: table
配置Target数据源
# vim /usr/local/dbt-trino-mysql-clickhouse/dbt/dbt-profiles/profiles.yml
dbt_project:
target: dev
outputs:
dev:
type: trino
host: 192.168.137.128
user: root
port: 10080
database: target_dw_clickhouse
schema: tj
threads: 4
session_properties:
query_max_run_time: 5d
exchange_compression: True
配置转换脚本
脚本放到**/usr/local/dbt-trino-mysql-clickhouse/dbt/app/dbt_project/models/example**目录下:
用户活跃统计
统计前一天用户活跃性情况,包括三种类型:仅机时活跃,仅控制台活跃,机时控制台活跃
/*user_active_by_day.sql*/
{{ config(materialized='incremental') }}
with user_active_by_day as (
SELECT t1.act_day,
t1.user_id,
CASE
WHEN t2.console_user_id IS NULL THEN 'job'
ELSE 'job_console'
END AS type
FROM
(SELECT date(accounting_day) AS act_day,
user_id
FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_daily_job
WHERE accounting_day >= (date (CURRENT_DATE) - interval '1' day) AND accounting_day < (date (CURRENT_DATE) - interval '0' day)
AND user_id IS NOT NULL
GROUP BY date(accounting_day),user_id) t1
LEFT JOIN
(SELECT date(visit_last_action_time) AS console_date,
user_id AS console_user_id
FROM {{ var("matomo_catalog") }}.{{ var("matomo_schema") }}.matomo_log_visit
WHERE visit_last_action_time >= (date (CURRENT_DATE) - interval '1' day) AND visit_last_action_time < (date (CURRENT_DATE) - interval '0' day)
AND user_id IS NOT NULL
GROUP BY date(visit_last_action_time),user_id) t2 ON t1.user_id=t2.console_user_id
AND t1.act_day=t2.console_date
UNION
SELECT t1.console_date AS act_day,
t1.console_user_id AS user_id,
CASE
WHEN t2.user_id IS NULL THEN 'console'
ELSE 'job_console'
END AS type
FROM
(SELECT date(visit_last_action_time) AS console_date,
user_id AS console_user_id
FROM {{ var("matomo_catalog") }}.{{ var("matomo_schema") }}.matomo_log_visit
WHERE visit_last_action_time >= (date (CURRENT_DATE) - interval '1' day) AND visit_last_action_time < (date (CURRENT_DATE) - interval '0' day)
AND user_id IS NOT NULL
GROUP BY date(visit_last_action_time),user_id) t1
LEFT JOIN
(SELECT date(accounting_day) AS act_day,
user_id
FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_daily_job
WHERE accounting_day >= (date (CURRENT_DATE) - interval '1' day) AND accounting_day < (date (CURRENT_DATE) - interval '0' day)
AND user_id IS NOT NULL
GROUP BY date(accounting_day),user_id) t2 ON t1.console_date=t2.act_day
AND t1.console_user_id=t2.user_id
)
SELECT * FROM user_active_by_day
用户类型统计
用户类型包括:是否有组织用户,是否付费主账号,是否付费用户
这里我们要分步处理
首先执行用户类型统计初始化脚本,手动执行一次
脚本:user_type.sql
{{ config(materialized='incremental') }}
with user_type as (
SELECT
e.*,
CONCAT(e.user_id,'-',e.present_type,'-',e.pay_type,'-',e.group_type,'-',date_format(e.start_date,'%Y-%m-%d')) as uni_key
FROM(
SELECT
a.user_id,
CASE
WHEN EXISTS (
SELECT 1 FROM (
SELECT t1.user_id user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract t1 LEFT JOIN {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract_item t2 ON t1.id = t2.contract_id WHERE t2.is_present = 0 and t2.received_payments > 0 GROUP BY t1.user_id
UNION
SELECT u2.user_id user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user as u1, {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user as u2 where u1.group_id=u2.group_id AND u1.user_id != u2.user_id AND
EXISTS(
SELECT 1 FROM (SELECT t1.user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract t1 LEFT JOIN {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract_item t2 ON t1.id = t2.contract_id WHERE t2.is_present = 0 and t2.received_payments > 0 GROUP BY t1.user_id) c WHERE c.user_id = u1.user_id
)
) d WHERE a.user_id = d.user_id
)
THEN
'pay'
ELSE
'no pay'
END as present_type,
CASE
WHEN EXISTS(
SELECT 1 FROM(
SELECT t1.user_id FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user t1 , {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_group t2 WHERE t1.user_id=t2.pay_user_id AND t2.pay_user_id IS NOT NULL
)b WHERE a.user_id = b.user_id
)
THEN
'master'
ELSE
'slave'
END as pay_type,
CASE
WHEN EXISTS(
SELECT 1 FROM(
SELECT t1.user_id FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user t1 WHERE t1.group_id IS NULL
)b WHERE a.user_id = b.user_id
)
THEN
'no group'
ELSE
'group'
END as group_type,
CURRENT_DATE as start_date,
DATE(null) as end_date
FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user a
)e
)
SELECT * FROM user_type
执行命令:
docker exec -it dbt /bin/bash
root@c19e30875880:/usr/app# cd dbt_project/
root@c19e30875880:/usr/app/dbt_project# dbt run
03:00:18 Running with dbt=1.0.0
03:00:18 [WARNING]: Configuration paths exist in your dbt_project.yml file which do not apply to any resources.
There are 1 unused configuration paths:
- models.example
03:00:18 Found 1 model, 0 tests, 0 snapshots, 0 analyses, 167 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics
03:00:18
03:00:26 Concurrency: 4 threads (target='dev')
03:00:26
03:00:26 1 of 1 START incremental model tj.user_type..................................... [RUN]
03:01:03 1 of 1 OK created incremental model tj.user_type................................ [SUCCESS in 36.85s]
03:01:03
03:01:03 Finished running 1 incremental model in 44.89s.
03:01:03
03:01:03 Completed successfully
03:01:03
03:01:03 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
执行成功后将user_type.sql脚本内容替换如下,得到下一次统计和历史数据的差集
重启dbt,通过crontab 定时执行dbt run调度
{{ config(engine='MergeTree()',materialized='incremental',unique_key='uni_key',incremental_strategy='delete+insert') }}
with query_user_type as (
SELECT
e.*,
CONCAT(e.user_id,'-',e.present_type,'-',e.pay_type,'-',e.group_type,'-',date_format(e.start_date,'%Y-%m-%d')) as uni_key FROM(
SELECT
a.user_id,
CASE
WHEN EXISTS (
SELECT 1 FROM (
SELECT t1.user_id user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract t1 LEFT JOIN {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract_item t2 ON t1.id = t2.contract_id WHERE t2.is_present = 0 and t2.received_payments > 0 GROUP BY t1.user_id
UNION
SELECT u2.user_id user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user as u1, {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user as u2 where u1.group_id=u2.group_id AND u1.user_id != u2.user_id AND
EXISTS(
SELECT 1 FROM (SELECT t1.user_id from {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract t1 LEFT JOIN {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_contract_item t2 ON t1.id = t2.contract_id WHERE t2.is_present = 0 and t2.received_payments > 0 GROUP BY t1.user_id) c WHERE c.user_id = u1.user_id
)
) d WHERE a.user_id = d.user_id
)
THEN
'pay'
ELSE
'no pay'
END as present_type,
CASE
WHEN EXISTS(
SELECT 1 FROM(
SELECT t1.user_id FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user t1 , {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_group t2 WHERE t1.user_id=t2.pay_user_id AND t2.pay_user_id IS NOT NULL
)b WHERE a.user_id = b.user_id
)
THEN
'master'
ELSE
'slave'
END as pay_type,
CASE
WHEN EXISTS(
SELECT 1 FROM(
SELECT t1.user_id FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user t1 WHERE t1.group_id IS NOT NULL
)b WHERE a.user_id = b.user_id
)
THEN
'group'
ELSE
'no group'
END as group_type,
CURRENT_DATE as start_date,
DATE(null) as end_date
FROM {{ var("ngbilling_catalog") }}.{{ var("ngbilling_schema") }}.b_user a
)e
),
add_or_update_user_type as (
SELECT
b.user_id,
b.present_type,
b.pay_type,
b.group_type,
b.start_date,
date(CURRENT_DATE) as end_date,
b.uni_key
FROM
query_user_type a
RIGHT JOIN user_type b ON a.user_id = b.user_id
WHERE
a.group_type != b.group_type
OR a.pay_type != b.pay_type
OR a.present_type != b.present_type
GROUP BY
b.user_id,b.present_type,b.pay_type,b.group_type,b.start_date,b.uni_key
UNION ALL
SELECT
a.user_id,
a.present_type,
a.pay_type,
a.group_type,
CURRENT_DATE as start_date,
DATE(null) as end_date,
a.uni_key
FROM
query_user_type a
WHERE NOT EXISTS(
SELECT 1 FROM user_type b WHERE a.user_id = b.user_id
)
OR EXISTS(
SELECT 1 FROM(
SELECT
a.user_id
FROM
query_user_type a
RIGHT JOIN
user_type b
ON a.user_id = b.user_id
WHERE
a.group_type != b.group_type
OR a.pay_type != b.pay_type
OR a.present_type != b.present_type
)c WHERE c.user_id = a.user_id
)
GROUP BY a.user_id,a.present_type,a.pay_type,a.group_type,a.uni_key
)
select * from add_or_update_user_type
执行后可以在target clickhouse生成对应的表及统计的数据:
表字段说明:
ngbilling_console_user_active(用户活跃表)
字段名 | 描述 |
---|---|
act_day | 统计时间 |
user_id | 用户id |
type | 类型:仅机时活跃,仅控制台活跃,机时控制台活跃 |
user_type(用户类型表)
该表保存用户类型信息,一个用户可以有多条数据,跟用户活跃表匹配时取start_date,end_date区间作为判断条件:
比如user_active_by_day.act_day = 2023-03-16
对应user_type(start_date,end_date)有三条数据,分别时间为:
2023-03-15 ~ 2023-03-17;
2023-03-18 ~ 2023-03-19;
2023-03-20 ~ ;
那么应该取2023-03-15 ~ 2023-03-17这个时间段对应的用户类型
字段名 | 描述 |
---|---|
user_id | 用户id |
present_type | 类型:付费用户,试算用户 |
pay_type | 类型:付费主账号,付费子账号 |
group_type | 类型:有组织用户,无组织用户 |
start_date | 用户类型有效开始时间 |
end_date | 用户类型有效结束时间,为空则表示至今 |
----以下为业务实现,具体实现方式可以参考如下使用dbt内嵌脚本创建统计结果表,也可以通过superset直连trino,编写统计脚本即时查询,不需要部署—
业务统计
根据具体业务来编写,目前需要统计的信息为:
以日期作为统计维度,统计如下信息:
当日提交作业用户数,当日提交作业付费用户数,当日提交作业付费主账号数,当日提交作业付费子账号数,当日提交作业试算用户数,当日提交作业无组织用户数,当日提交作业未登录控制台用户数,当日提交作业登录控制台用户数,当日登录控制台用户数,当日登录控制台的付费用户数,当日登录控制台的付费主账号数,当日登录控制台的付费子账号数,当日登录控制台试算用户数,当日登录控制台无组织用户数
我们以上面用户活跃、用户类型两个基础表作为统计依据,统计前一天的用户信息,实现如下dbt脚本
/*user_active_by_day_cnt.sql*/
{{ config(materialized='incremental') }}
with user_active_by_day_cnt as (
SELECT
(date (CURRENT_DATE) - interval '1' day) sub_date,
SUM(CASE WHEN c.type = 'job' OR c.type = 'job_console' THEN 1 ELSE 0 END) as job_user_cnt,
SUM(CASE WHEN c.type = 'job' AND c.present_type = 'pay' THEN 1 ELSE 0 END) as job_pay_user_cnt,
SUM(CASE WHEN c.type = 'job' AND c.present_type = 'no pay' THEN 1 ELSE 0 END) as job_not_pay_user_cnt,
SUM(CASE WHEN c.type = 'job' AND c.pay_type = 'master' THEN 1 ELSE 0 END) as job_master_pay_user_cnt,
SUM(CASE WHEN c.type = 'job' AND c.pay_type = 'slave' THEN 1 ELSE 0 END) as job_salve_pay_user_cnt,
SUM(CASE WHEN c.type = 'job' AND c.group_type = 'no group' THEN 1 ELSE 0 END) as job_not_group_user_cnt,
SUM(CASE WHEN c.type = 'job_console' THEN 1 ELSE 0 END) as job_login_console_user_cnt,
SUM(CASE WHEN c.type = 'job' THEN 1 ELSE 0 END) as job_not_login_console_user_cnt,
SUM(CASE WHEN c.type = 'console' OR c.type = 'job_console' AND c.group_type = 'no group' THEN 1 ELSE 0 END) as console_login_user_cnt,
SUM(CASE WHEN c.type = 'console' AND c.present_type = 'pay' THEN 1 ELSE 0 END) as console_login_pay_user_cnt,
SUM(CASE WHEN c.type = 'console' AND c.present_type = 'no pay' THEN 1 ELSE 0 END) as console_login_not_pay_user_cnt,
SUM(CASE WHEN c.type = 'console' AND c.pay_type = 'master' THEN 1 ELSE 0 END) as console_login_master_pay_user_cnt,
SUM(CASE WHEN c.type = 'console' AND c.pay_type = 'slave' THEN 1 ELSE 0 END) as console_login_slave_pay_user_cnt,
SUM(CASE WHEN c.type = 'console' AND c.group_type = 'no group' THEN 1 ELSE 0 END) as console_login_not_group_user_cnt
FROM (
SELECT
a.*,
b.present_type,
b.pay_type ,
b.group_type
FROM {{ ref('user_active_by_day') }} a
left join
user_type b on a.user_id = b.user_id
WHERE a.act_day = (date (CURRENT_DATE) - interval '1' day) AND ((a.act_day BETWEEN DATE(b.start_date) AND DATE(b.end_date)) OR (a.act_day >=DATE(b.start_date) AND b.end_date IS NULL))
)AS c
)
SELECT * FROM user_active_by_day_cnt
执行生成如下表结构: