推荐系统笔记-07-离线特征数据集

最新推荐文章于 2023-10-23 20:46:27 发布

人生偌只如初见

最新推荐文章于 2023-10-23 20:46:27 发布

阅读量414

点赞数

分类专栏： DataMining Python 文章标签：推荐特征 Python

本文链接：https://blog.csdn.net/fighting_one_piece/article/details/103891146

版权

DataMining 同时被 2 个专栏收录

49 篇文章 1 订阅

订阅专栏

Python

33 篇文章 0 订阅

订阅专栏

1、相关环境

hadoop-2.10.0

hive-3.1.2

hbase-2.2.2

spark-2.4.4

2、相关表结构

HBase

create 'ctr_user_features', 'cf'

create 'ctr_article_features', 'cf'

Hive

CREATE EXTERNAL TABLE T_CTR_USER_FEATURES(
USER_ID STRING,
CF MAP<STRING, STRING>
)
COMMENT 'CTR USER FEATURES'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:")
TBLPROPERTIES ("hbase.table.name" = "ctr_user_features");

CREATE EXTERNAL TABLE T_CTR_ARTICLE_FEATURES(
ARTICLE_ID STRING,
CF MAP<STRING, STRING>
)
COMMENT 'CTR ARTICLE FEATURES'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:")
TBLPROPERTIES ("hbase.table.name" = "ctr_article_features");

3、相关Python实现

# -*- coding:utf-8 -*-

import os
import sys
import numpy as np
from datetime import datetime
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(BASE_PATH))
print sys.path
from offline import BaseSparkSession

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

os.environ['PYSPARK_PYTHON'] = 'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = 'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = 'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = 'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'


class UserArticleFeaturesGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'user_article_features_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # 生成用户特征
    def gen_user_features(self):
        self.spark_session.sql("use portal")
        # 获取用户画像 基础数据、偏好喜好数据
        sql = "select split(user_id, ':')[1] user_id, basic_info.gender, basic_info.age, preference_info " \
              "from t_user_profile"
        user_profile_df = self.spark_session.sql(sql)
        user_profile_df.show()

        # 抽取文章所属频道关键词向量特征
        def extract_channel_keyword_feature(partition):
            from pyspark.ml.linalg import Vectors

            channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]

            for row in partition:
                for channel_id in channel_id_list:
                    try:
                        weights = sorted([row.preference_info[key] for key in row.preference_info.keys()
                                          if key.split(':')[0] == str(channel_id)], reverse=True)[:10]
                    except Exception as e:
                        print e.message
                        weights = [0.0] * 10
                    yield row.user_id, int(row.gender), int(row.age), channel_id, \
                        Vectors.dense(weights if weights else [0.0] * 10)

        user_profile_df = user_profile_df.rdd.mapPartitions(extract_channel_keyword_feature). \
            toDF(["user_id", "gender", "age", "channel_id", "channel_weights"])

        # 收集特征
        from pyspark.ml.feature import VectorAssembler
        user_profile_df = VectorAssembler().setInputCols(["gender", "age", "channel_weights"]) \
                                           .setOutputCol("features") \
                                           .transform(user_profile_df)

        # 存储用户特征
        def insert_ctr_user_features(partition):

            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)

            for row in partition:
                hbase_utils.insert("ctr_user_features", str(row.user_id).encode(),
                                   {"cf:{}".format(row.channel_id).encode(): str(row.features).encode()})

        user_profile_df.foreachPartition(insert_ctr_user_features)

    # 生成文章特征
    def gen_article_features(self):
        self.spark_session.sql("use portal")

        # 获取文章画像
        article_profile_df = self.spark_session.sql("select * from t_article_profile")

        # 抽取文章关键词向量特征
        def extract_feature(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                try:
                    weights = sorted(row.keywords.values(), reverse=True)[:10]
                except Exception as e:
                    print e.message
                    weights = [0.0] * 10
                yield row.article_id, row.channel_id, Vectors.dense(weights if weights else [0.0] * 10)

        article_profile_df = article_profile_df.rdd.mapPartitions(extract_feature) \
            .toDF(["article_id", "channel_id", "article_weights"])
        article_profile_df.show()

        # 获取文章向量
        article_vector_df = self.spark_session.sql("select article_id, vector from t_article_vector")

        def array_to_vector(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                yield row.article_id, Vectors.dense(row.vector)

        article_vector_df = article_vector_df.rdd.mapPartitions(array_to_vector).toDF(["article_id", "article_vector"])
        article_vector_df.show()

        article_profile_df = article_profile_df.join(article_vector_df, on=["article_id"], how="inner")

        # 收集特征
        from pyspark.ml.feature import VectorAssembler
        article_profile_df = VectorAssembler().setInputCols(["channel_id", "article_weights", "article_vector"]) \
                                              .setOutputCol("features") \
                                              .transform(article_profile_df)
        article_profile_df.show()

        # 存储文章特征
        def insert_ctr_article_features(partition):

            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)

            for row in partition:
                hbase_utils.insert("ctr_article_features", str(row.article_id).encode(),
                                   {"cf:{}".format(row.article_id).encode(): str(row.features).encode()})

        article_profile_df.foreachPartition(insert_ctr_article_features)


if __name__ == '__main__':
    user_article_features_generator = UserArticleFeaturesGenerator()
    user_article_features_generator.gen_user_features()
    user_article_features_generator.gen_article_features()

针对上一章节的CTR排序模型LR可以做一些优化处理

    def gen_lr_sort_model_optimize(self):
        self.spark_session.sql("use portal")

        # 获取用户特征数据
        ctr_user_features_df = self.spark_session.sql("select * from t_ctr_user_features")

        def extract_user_features(partition):
            for row in partition:
                for item in row.cf.items():
                    yield row.user_id, item[0], eval(item[1])

        ctr_user_features_df = ctr_user_features_df.rdd.mapPartitions(extract_user_features) \
            .toDF(["user_id", "channel_id", "user_features"])
        ctr_user_features_df.show()

        # 获取文章特征数据
        ctr_article_features_df = self.spark_session.sql("select * from t_ctr_article_features")

        def extract_article_features(partition):
            for row in partition:
                for item in row.cf.items():
                    yield row.article_id, eval(item[1])

        ctr_article_features_df = ctr_article_features_df.rdd.mapPartitions(extract_article_features) \
            .toDF(["article_id", "article_features"])
        ctr_article_features_df.show()

        # 用户文章点击行为
        sql = "select user_id, article_id, channel_id, click_flag from t_user_behaviour"
        user_article_click_df = self.spark_session.sql(sql)
        user_article_click_df.show()

        # 关联用户、文章特征
        user_article_click_df = user_article_click_df.join(ctr_user_features_df, ["user_id", "channel_id"], "left")
        user_article_click_df = user_article_click_df.join(ctr_article_features_df, ["article_id"], "left")
        input_cols = ["channel_id", "gender", "age", "channel_weights", "article_weights", "article_vector"]
        
        # 收集特征
        def assemble_featrues(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                features = []
                features.append(row.channel_id)
                features.extend(row.user_features)
                features.extend(row.article_features[1:] if row.article_features else [0.0] * 110)
                yield row.user_id, row.article_id, row.click_flag, Vectors.dense(features)

        user_article_click_df = user_article_click_df.rdd.mapPartitions(assemble_featrues) \
            .toDF(["user_id", "article_id", "click_flag", "features"])
        user_article_click_df.show()

        # Logistic Regression
        from pyspark.ml.classification import LogisticRegression
        logistic_regression = LogisticRegression()
        logistic_regression_model = logistic_regression.setFeaturesCol("features") \
            .setLabelCol("click_flag") \
            .fit(user_article_click_df)
        logistic_regression_model.write().overwrite().save(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")

        from pyspark.ml.classification import LogisticRegressionModel
        logistic_regression_model = LogisticRegressionModel.load(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
        logistic_regression_result = logistic_regression_model.transform(user_article_click_df)
        logistic_regression_result.select(["click_flag", "probability", "prediction"]).show()