Oneid 图计算落地方案

CesarChoy

已于 2024-01-31 10:01:21 修改

阅读量492

点赞数 4

文章标签：大数据用户画像

于 2023-11-15 18:50:01 首次发布

本文链接：https://blog.csdn.net/weixin_42687074/article/details/134426672

版权

一、前文

oneid 是用户画像的核心，此文提供图计算的具体方案。

二、方案

注意事项：

1. 业务存在解绑信息，当不与其他业务系统产生关联时，沿用旧oneid。

2. oneid 需要自增，下游系统会用到bitmap等数据类型，有利于人群包计算。

3. 标签与oneid 解耦，最后通过映射表关联，可大大调高计算效率。

三、开发

输入数据处理	大宽表(原始)	100%	dwd_portrait_oneid_user_d_a
	映射码表	100%	dwd_portrait_oneid_user_col_d_a
	映射id码表	100%	dwd_portrait_oneid_user_col_relation_d_a
	大宽表(编码)	100%	dwd_portrait_oneid_user_encode_d_a
	图输入点边关系表	100%	dwd_portrait_oneid_graph_input_d_a
输出数据处理	图输出点边关系表	100%	dwd_portrait_oneid_graph_output_d_a
	映射oneid码表	100%	dwd_portrait_oneid_user_col_relation_minid_d_a
	大宽表(解码)	100%	dwd_portrait_oneid_user_decode_d_a
	结果表	100%	dwd_portrait_oneid_user_total_d_a

3.1 生成大宽表，拉宽不同业务的用户信息/关系；

CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_d_a
(
    union_id    STRING COMMENT 'union_id'
    ,phone      STRING COMMENT '电话号码'
    ,parent_id  STRING COMMENT '家长账号id'
    ,student_id STRING COMMENT '学生账号id'
    ,machine_no STRING COMMENT '内码'
    ,flowcode   STRING COMMENT '外码'
)
COMMENT '用户画像基础明细表'
PARTITIONED BY 
(
    ds          STRING COMMENT '分区字段'
)
;

INSERT OVERWRITE TABLE yxp.dwd_portrait_oneid_user_d_a PARTITION (ds = '${bizdate}')
SELECT  union_id
        ,phone
        ,parent_id
        ,student_id
        ,machine_no
        ,flowcode
FROM    (
            SELECT  union_id
                    ,phone
                    ,NULL AS parent_id
                    ,NULL AS student_id
                    ,NULL AS machine_no
                    ,NULL AS flowcode
            FROM    dim_operation_clue_channel_d_a_v4
            WHERE   ds = '${bizdate}'
            AND     COALESCE(union_id,phone) <> ''
            UNION ALL
            SELECT  wx_union_id AS union_id
                    ,phone_num AS phone
                    ,user_id AS parent_id
                    ,NULL AS student_id
                    ,NULL AS machine_no
                    ,NULL AS flowcode
            FROM    dim_public_parent_user_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,NULL AS phone
                    ,user_id AS parent_id
                    ,NULL AS student_id
                    ,machine_no
                    ,NULL AS flowcode
            FROM    dim_pad_device_parent_bind_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,NULL AS phone
                    ,NULL AS parent_id
                    ,NULL AS student_id
                    ,machine_no
                    ,NULL AS flowcode
            FROM    dim_pad_device_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,NULL AS phone
                    ,NULL AS parent_id
                    ,NULL AS student_id
                    ,softwarecode AS machine_no
                    ,flowcode
            FROM    ods_pad_bigdata_softwarecodeflowcodebind_mid1_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,phone_num AS phone
                    ,NULL AS parent_id
                    ,user_id AS student_id
                    ,NULL AS machine_no
                    ,NULL AS flowcode
            FROM    dim_pad_user_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,NULL AS phone
                    ,NULL AS parent_id
                    ,user_id AS student_id
                    ,machine_no
                    ,NULL AS flowcode
            FROM    dim_pad_device_account_mapping_d_a
            WHERE   ds = '${bizdate}'
            UNION ALL
            SELECT  NULL AS union_id
                    ,NULL AS phone
                    ,phone_num AS parent_id
                    ,NULL AS student_id
                    ,machine_no
                    ,sn AS flowcode
            FROM    tab_warranty
            WHERE   ds = '${bizdate}'
        ) 
;

3.2映射码表(提取字段为oneid做准备)

from odps import ODPS
# (1) 参数：
# 表名
input_tb = 'dwd_portrait_oneid_user_d_a'
output_tb = 'dwd_portrait_oneid_user_col_d_a'
# CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_col_d_a
# (
#     col_name   STRING COMMENT '列名'
#     ,col_value STRING COMMENT '列值'
# )
# COMMENT '用户画像列表'
# PARTITIONED BY 
# (
#     ds         STRING COMMENT '分区字段'
# )
# ;

# 分区名
ds = args['bizdate']

# (2) 组装SQL
# 提取列名 和 列值
t = o.get_table(input_tb)
col_sql = []
for col in t.schema.names :
    col_sql.append(" select '{col}' AS col_name,{col} AS col_value from {input_tb} where ds = '{ds}' and {col} <> '' ".format(col=col,input_tb=input_tb,ds=ds))
union_sql = ' union all '.join(col_sql)
alldata_sql = "select col_name,col_value from ({union_sql})t1 group by col_name,col_value".format(union_sql=union_sql)
final_sql = " INSERT OVERWRITE TABLE {output_tb} PARTITION (ds = '{ds}') {alldata_sql} ".format(output_tb=output_tb,alldata_sql=alldata_sql,ds=ds)

# (3) 写入数据
print(final_sql)
o.execute_sql(final_sql)

3.3映射id码表(生成oneid)

CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_col_relation_d_a
(
    id         BIGINT COMMENT '映射id'
    ,col_name  STRING COMMENT '字段名'
    ,col_value STRING COMMENT '字段值'
)
COMMENT '用户画像id映射表'
PARTITIONED BY 
(
    ds         STRING COMMENT '分区字段'
)
;

WITH tmp_today AS 
(
    SELECT  col_name
            ,col_value
    FROM    dwd_portrait_oneid_user_col_d_a
    WHERE   ds = '${bizdate}'
)
,tmp_yes AS 
(
    SELECT  id
            ,col_name
            ,col_value
    FROM    dwd_portrait_oneid_user_col_relation_d_a
    WHERE   ds = TO_CHAR(DATE_ADD(TO_DATE('${bizdate}','yyyymmdd'),-1),'yyyyMMdd')
)
,tmp_today_added AS 
(
    SELECT  t1.col_name
            ,t1.col_value
            ,ROW_NUMBER() OVER (ORDER BY t1.col_name,t1.col_value ASC ) AS rk
    FROM    tmp_today t1
    LEFT JOIN tmp_yes t2
    ON      t1.col_name = t2.col_name
    AND     t1.col_value = t2.col_value
    WHERE   t2.col_name IS NULL -- 新增
)
INSERT OVERWRITE TABLE yxp.dwd_portrait_oneid_user_col_relation_d_a PARTITION (ds = '${bizdate}')
SELECT  rk + COALESCE((
            SELECT  MAX(id) maxid
            FROM    tmp_yes
        ) ,0) AS id
        ,col_name
        ,col_value
FROM    tmp_today_added -- 今天
UNION ALL
SELECT  id
        ,col_name
        ,col_value
FROM    tmp_yes -- 昨天
;

3.4大宽表(编码)，将原来的字段值隐射成oneid用于图计算.

from odps import ODPS
# (1) 参数：
# 表名
input_tb = 'dwd_portrait_oneid_user_d_a'
id_tb = 'dwd_portrait_oneid_user_col_relation_d_a'
output_tb = 'dwd_portrait_oneid_user_encode_d_a'
# 输入输出表字段要保持一致
# CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_encode_d_a
# (
#     union_id    BIGINT COMMENT 'union_id'
#     ,phone      BIGINT COMMENT '电话号码'
#     ,parent_id  BIGINT COMMENT '家长账号id'
#     ,student_id BIGINT COMMENT '学生账号id'
#     ,machine_no BIGINT COMMENT '内码'
#     ,flowcode   BIGINT COMMENT '外码'
# )
# COMMENT '用户画像基础明细表'
# PARTITIONED BY 
# (
#     ds          STRING COMMENT '分区字段'
# )
# ;

# 分区名
ds = args['bizdate']

# (2) 组装SQL
# 提取列名 和 列值
t = o.get_table(input_tb)

arr_select = []
arr_join = []
for col in t.schema.names :
    arr_select.append('tmp_{col}.id as {col}'.format(col=col))
    arr_join.append("left join (select col_value,id from {id_tb} where ds = '{ds}' and col_name = '{col}') tmp_{col} on {input_tb}.{col} = tmp_{col}.col_value".format(id_tb=id_tb,col=col,ds=ds,input_tb=input_tb))

final_sql = " INSERT OVERWRITE TABLE {output_tb} PARTITION (ds = '{ds}') select ".format(output_tb=output_tb,ds=ds) + ','.join(arr_select) + " from {input_tb} ".format(input_tb=input_tb) + ' '.join(arr_join) + " where ds = '{ds}' ".format(ds=ds)

# (3) 写入数据
print(final_sql)
o.execute_sql(final_sql)

3.5 图输入点边关系表(用于图计算的结构)

from odps import ODPS
# (1) 参数：
# 表名
input_tb = 'dwd_portrait_oneid_user_encode_d_a'
output_tb = 'dwd_portrait_oneid_graph_input_d_a'
# CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_graph_input_d_a
# (
#     v1  BIGINT COMMENT '顶点1'
#     ,v2 BIGINT COMMENT '顶点2'
# )
# COMMENT '用户画像关系表(编码)'
# PARTITIONED BY 
# (
#     ds  STRING COMMENT '分区字段'
# )
# ;

# 分区名
ds = args['bizdate']

# (2) 组装SQL
# 提取列名 和 列值
t = o.get_table(input_tb)
col_arr = []
for col in t.schema.names :
    col_arr.append(col)

mapping_arr = []
index = 0
while index < len(col_arr) :
    right = index + 1
    while right < len(col_arr) :
        mapping_arr.append(" select {col1} AS v1,{col2} AS v2 from {input_tb} where ds = '{ds}' and {col1} is not null and {col2} is not null ".format(col1=col_arr[index],col2=col_arr[right],input_tb=input_tb,ds=ds))
        right += 1
    index += 1

final_sql = "INSERT OVERWRITE TABLE {output_tb} PARTITION (ds = '{ds}') ".format(output_tb=output_tb,ds=ds) + ' union all '.join(mapping_arr)

# (3) 写入数据
print(final_sql)
o.execute_sql(final_sql)

3.6 图计算输出

3.7映射oneid码表(将每个字段绑定最小oneid)

CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_col_relation_minid_d_a
(
    id         BIGINT COMMENT '映射id'
    ,col_name  STRING COMMENT '字段名'
    ,col_value STRING COMMENT '字段值'
    ,minid     BIGINT COMMENT 'minid'
    ,oneid     BIGINT COMMENT 'oneid'
)
COMMENT '用户画像id映射输出表'
PARTITIONED BY 
(
    ds         STRING COMMENT '分区字段'
)
;

INSERT OVERWRITE TABLE yxp.dwd_portrait_oneid_user_col_relation_minid_d_a PARTITION (ds = '${bizdate}')
SELECT  t1.id
        ,t1.col_name
        ,t1.col_value
        ,t2.minid
        ,COALESCE(t2.minid,t1.id) AS oneid -- 没有边关系时最小oneid 既是自己
FROM    dwd_portrait_oneid_user_col_relation_d_a t1
LEFT JOIN dwd_portrait_oneid_graph_output_d_a t2
ON      t1.id = t2.v
AND     t2.ds = '${bizdate}'
WHERE   t1.ds = '${bizdate}'
;

3.8大宽表(解码，将oneid字段回填大宽表最终输出)

from odps import ODPS
# (1) 参数：
# 表名
input_tb = 'dwd_portrait_oneid_user_d_a'
id_tb = 'dwd_portrait_oneid_user_col_relation_minid_d_a'
output_tb = 'dwd_portrait_oneid_user_decode_d_a'
# 输出表比原始表多一个字段要保持一致
# CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_decode_d_a
# (
#     oneid       BIGINT COMMENT '唯一id'
#     ,union_id   STRING COMMENT 'union_id'
#     ,phone      STRING COMMENT '电话号码'
#     ,parent_id  STRING COMMENT '家长账号id'
#     ,student_id STRING COMMENT '学生账号id'
#     ,machine_no STRING COMMENT '内码'
#     ,flowcode   STRING COMMENT '外码'
# )
# COMMENT '用户画像基础明细表(解码)'
# PARTITIONED BY 
# (
#     ds          STRING COMMENT '分区字段'
# )
# ;


# 分区名
ds = args['bizdate']

# (2) 组装SQL
# 提取列名 和 列值
t = o.get_table(input_tb)

arr_select_oneid = []
arr_select = []
arr_join = []
for col in t.schema.names :
    arr_select_oneid.append('tmp_{col}.oneid'.format(col=col))
    arr_select.append("{input_tb}.{col}".format(input_tb=input_tb,col=col))
    arr_join.append("left join (select col_value,oneid from {id_tb} where ds = '{ds}' and col_name = '{col}') tmp_{col} on {input_tb}.{col} = tmp_{col}.col_value".format(id_tb=id_tb,col=col,ds=ds,input_tb=input_tb))

final_sql = " INSERT OVERWRITE TABLE {output_tb} PARTITION (ds = '{ds}') select coalesce({sql_oneid}) as oneid,{sql_col} from {input_tb} {sql_join} where ds = '{ds}' " \
            .format(sql_oneid=','.join(arr_select_oneid) \
                , sql_col=','.join(arr_select) \
                ,sql_join=' '.join(arr_join) \
                ,output_tb=output_tb ,input_tb=input_tb ,ds=ds )

# (3) 写入数据
print(final_sql)
o.execute_sql(final_sql)

3.9 结果表(可通过oneid与字段值互查)

CREATE TABLE IF NOT EXISTS yxp.dwd_portrait_oneid_user_total_d_a
(
    oneid       BIGINT COMMENT '唯一id'
    ,union_id   ARRAY<STRING> COMMENT 'union_id'
    ,phone      ARRAY<STRING> COMMENT '电话号码'
    ,parent_id  ARRAY<STRING> COMMENT '家长账号id'
    ,student_id ARRAY<STRING> COMMENT '学生账号id'
    ,machine_no ARRAY<STRING> COMMENT '内码'
    ,flowcode   ARRAY<STRING> COMMENT '外码'
)
COMMENT '用户画像基础结果表'
PARTITIONED BY 
(
    ds          STRING COMMENT '分区字段'
)
;

-- EXPLAIN 
INSERT OVERWRITE TABLE yxp.dwd_portrait_oneid_user_total_d_a PARTITION (ds = '${bizdate}')
SELECT  oneid
        ,COLLECT_SET(union_id) AS union_id
        ,COLLECT_SET(phone) AS phone
        ,COLLECT_SET(parent_id) AS parent_id
        ,COLLECT_SET(student_id) AS student_id
        ,COLLECT_SET(machine_no) AS machine_no
        ,COLLECT_SET(flowcode) AS flowcode
FROM    dwd_portrait_oneid_user_decode_d_a
WHERE   ds = '${bizdate}'
GROUP BY oneid
;