1. Environment
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2. Table Schemas
Hive
CREATE TABLE T_USER_OP_LOG(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
OP_TYPE INT COMMENT '1 display 2 click 3 collect 4 share',
OP_TIME STRING,
OP_DURATION INT,
ALGO INT
)
COMMENT 'user operation log table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_op_log';
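Because the table is comma-delimited text on HDFS, each operation is one CSV line. For illustration only (hypothetical values; the op_time format matches the '%Y-%m-%d %H:%M:%S' pattern parsed in section 3):

1001,article_100,5,2,2024-01-01 10:00:00,1200,1
1001,article_100,5,3,2024-01-01 10:05:00,0,1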
CREATE TABLE T_USER_BEHAVIOUR(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
DISPLAY_FLAG INT,
CLICK_FLAG INT,
COLLECT_FLAG INT,
SHARE_FLAG INT,
OP_TIME STRING,
OP_DURATION INT
)
COMMENT 'user behaviour table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_behaviour';
HBase
create 'user_profile', 'b', 'p'
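The 'b' column family holds basic user info and 'p' holds preference weights, matching the columns written by the Python code below. As a hypothetical shell equivalent of those writes (row key is 'u:' plus the user id, preference qualifier is channel_id:topic):

put 'user_profile', 'u:1', 'b:name', 'alice'
put 'user_profile', 'u:1', 'p:5:spark', '2.8591'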
3. Python Implementation
# -*- coding:utf-8 -*-
import os
import sys

import numpy as np
from datetime import datetime

# Make the project root importable
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print(sys.path)

from offline import BaseSparkSession

# Python 2: force UTF-8 as the default encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local Windows environment for submitting to YARN
# (raw strings keep the backslashes from being treated as escapes)
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
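The offline.BaseSparkSession base class is not shown in this post. Given the attributes set in __init__ below, a minimal sketch of what its create_spark_session might look like (the structure and config key are assumptions, not the original implementation):

from pyspark.sql import SparkSession

class BaseSparkSession(object):
    # Minimal sketch of the assumed base class
    def create_spark_session(self):
        builder = SparkSession.builder \
            .appName(self.SPARK_APP_NAME) \
            .master(self.SPARK_MASTER_URL) \
            .config("spark.yarn.queue", self.SPARK_YARN_QUEUE)
        if self.ENABLE_HIVE_SUPPORT:
            builder = builder.enableHiveSupport()
        return builder.getOrCreate()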
class UserProfileGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'user_profile_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()
    # Generate user behaviour data from the raw operation log
    def gen_user_behaviour(self):
        self.spark_session.sql("use portal")
        user_op_log_df = self.spark_session.sql("select * from t_user_op_log")

        # Expand op_type into display/click/collect/share flags
        # (a collect or share also implies the article was displayed and clicked)
        def convert_to_behaviour(partition):
            for row in partition:
                if row.op_type == 1:
                    yield row.user_id, row.article_id, row.channel_id, 1, 0, 0, 0, row.op_time, row.op_duration
                elif row.op_type == 2:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 0, 0, row.op_time, row.op_duration
                elif row.op_type == 3:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 1, 0, row.op_time, row.op_duration
                elif row.op_type == 4:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 0, 1, row.op_time, row.op_duration

        user_behaviour_df = user_op_log_df.rdd.mapPartitions(convert_to_behaviour) \
            .toDF(["user_id", "article_id", "channel_id", "display_flag", "click_flag",
                   "collect_flag", "share_flag", "op_time", "op_duration"])
        user_behaviour_df.show()

        # Merge a user's multiple operations on the same article into one row
        user_behaviour_df.createOrReplaceTempView("tmp_user_behaviour")
        sql = "select user_id, article_id, min(channel_id) channel_id, max(display_flag) display_flag, " \
              "max(click_flag) click_flag, max(collect_flag) collect_flag, max(share_flag) share_flag, " \
              "max(op_time) op_time, max(op_duration) op_duration " \
              "from tmp_user_behaviour group by user_id, article_id"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()
        user_behaviour_df.write.insertInto("t_user_behaviour")
    # Generate the basic portion of the user profile (column family 'b')
    def gen_user_basic_profile(self):
        self.spark_session.sql("use portal")
        user_df = self.spark_session.sql("select * from t_user")
        user_df.show()

        def insert_user_profile(partition):
            # Re-add the project root on the executor before importing local modules
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                basic_info = {
                    "b:name": row.name,
                    "b:gender": str(row.gender),
                    "b:age": str(row.age),
                    "b:mobile_phone": row.mobile_phone,
                    "b:email": row.email
                }
                hbase_utils.insert("user_profile", "u:{}".format(row.id).encode(), basic_info)

        user_df.foreachPartition(insert_user_profile)
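The recoutils.hbase_utils.HBaseUtils helper is not shown either. A plausible minimal version built on happybase, the Thrift-based HBase client for Python (only the constructor arguments and the insert call are taken from the code above; the rest is an assumption):

import happybase

class HBaseUtils(object):
    # Sketch of the assumed helper, backed by a happybase connection pool
    def __init__(self, host, port, size):
        self.pool = happybase.ConnectionPool(size, host=host, port=port)

    def insert(self, table_name, row_key, values):
        # values maps 'family:qualifier' keys to encoded values
        with self.pool.connection() as connection:
            connection.table(table_name).put(row_key, values)

Port 9090 in the calls above is the default HBase Thrift server port, so the Thrift service must be running on the target host.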
    # Generate the user preference profile (column family 'p')
    def gen_user_preference_profile(self):
        self.spark_session.sql("use portal")

        # Join behaviour rows with article topic keywords; keep only rows
        # with at least one positive action (click, collect or share)
        sql = "select ub.*, ap.topics from t_user_behaviour ub " \
              "left join t_article_profile ap on ub.article_id = ap.article_id " \
              "where (ub.click_flag != 0 or ub.collect_flag != 0 or ub.share_flag != 0) " \
              "and ap.topics is not null"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()

        # Explode the topics array into one row per (behaviour, topic) pair
        import pyspark.sql.functions as F
        user_behaviour_df = user_behaviour_df.withColumn("topic", F.explode("topics")).drop("topics")
        user_behaviour_df.show()

        def insert_user_profile(partition):
            # Re-add the project root on the executor before importing local modules
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                # Channel-topic tag weight = time decay factor * weighted behaviour sum
                t = datetime.now() - datetime.strptime(row.op_time, '%Y-%m-%d %H:%M:%S')
                time_exp = 1 / (np.log(t.days + 1) + 1)
                duration_weight = 2 if row.op_duration > 1000 else 1
                weight = time_exp * (row.click_flag * 5 + row.collect_flag * 3 + row.share_flag * 4 + duration_weight)
                hbase_utils.insert("user_profile", "u:{}".format(row.user_id).encode(),
                                   {"p:{}:{}".format(row.channel_id, row.topic).encode(): b'%0.4f' % weight})
        user_behaviour_df.foreachPartition(insert_user_profile)
if __name__ == '__main__':
    user_profile_generator = UserProfileGenerator()
    user_profile_generator.gen_user_behaviour()
    user_profile_generator.gen_user_basic_profile()
    user_profile_generator.gen_user_preference_profile()
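Once submitted (for example spark-submit --master yarn --queue queue3 user_profile_generator.py, assuming that file name), a user's generated profile can be inspected in the HBase shell with get 'user_profile', 'u:1'.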