实时产生推荐结果

日萌社

人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新)


六 实时产生推荐结果

6.1 推荐任务处理

  • CTR预测模型 + 特征 ==> 预测结果 ==> TOP-N列表
  • 特征获取
import redis
import json
import pandas as pd
from pyspark.ml.linalg import DenseVector


def create_datasets(userId, pid):
    """Assemble CTR-model input rows for every ad recalled for one user.

    The user's cached features and the features of each ad in the user's
    recall set are read from Redis. Categorical values are one-hot encoded
    through the ``*_rela`` lookup tables (defined outside this snippet;
    presumably dicts mapping a raw category value to its one-hot position)
    and concatenated behind the ad price into one ``DenseVector``.

    Vector layout (39 values in total):
        price (1), pid (2), cms_group_id (13), final_gender_code (2),
        age_level (7), shopping_level (3), occupation (2),
        pvalue_level (4), new_user_class_level (5)

    Args:
        userId: User ID; used as the field of the ``user_features`` hash
            and as the key of the user's recall set.
        pid: Ad-slot ID; looked up in ``pid_rela``.

    Returns:
        A list of ``(userId, adgroupId, vector)`` tuples, one per recalled
        ad that has cached features. Ads without cached features are
        skipped.

    Raises:
        TypeError: if no features are cached for ``userId`` (``hget``
            yields ``None``, which ``json.loads`` rejects).
        KeyError: presumably, if ``pid`` or a categorical value is missing
            from its ``*_rela`` table (assuming the tables are dicts).
    """

    def one_hot(size, index):
        # A list of `size` zeros with a single 1 at position `index`.
        encoded = [0] * size
        encoded[index] = 1
        return encoded

    client_of_recall = redis.StrictRedis(host="192.168.19.137", port=6379, db=9)
    client_of_features = redis.StrictRedis(host="192.168.19.137", port=6379, db=10)

    # Cached user features.
    user_feature = json.loads(client_of_features.hget("user_features", userId))

    # Recall set of the user: the candidate ad IDs.
    recall_sets = client_of_recall.smembers(userId)

    result = []
    for adgroupId in recall_sets:
        adgroupId = int(adgroupId)

        raw_ad_feature = client_of_features.hget("ad_features", adgroupId)
        if raw_ad_feature is None:
            # Nothing cached for this ad, so it cannot be scored. Skip it
            # rather than let json.loads(None) abort the whole request.
            continue
        ad_feature = json.loads(raw_ad_feature)

        # Merge user and ad features; ad values win on a key clash.
        features = dict(user_feature)
        features.update(ad_feature)
        # Missing values are represented by -1 before encoding.
        features = {k: (-1 if v is None else v) for k, v in features.items()}

        price = float(features["price"])

        pid_value = one_hot(2, pid_rela[pid])
        cms_group_id_value = one_hot(
            13, cms_group_id_rela[int(features["cms_group_id"])]
        )
        final_gender_code_value = one_hot(
            2, final_gender_code_rela[int(features["final_gender_code"])]
        )
        age_level_value = one_hot(7, age_level_rela[int(features["age_level"])])
        shopping_level_value = one_hot(
            3, shopping_level_rela[int(features["shopping_level"])]
        )
        occupation_value = one_hot(2, occupation_rela[int(features["occupation"])])
        pvalue_level_value = one_hot(
            4, pvalue_level_rela[int(features["pvalue_level"])]
        )
        new_user_class_level_value = one_hot(
            5, new_user_class_level_rela[int(features["new_user_class_level"])]
        )

        # The concatenation order must stay exactly as below; it defines
        # the feature positions handed to the model.
        vector = DenseVector(
            [price]
            + pid_value
            + cms_group_id_value
            + final_gender_code_value
            + age_level_value
            + shopping_level_value
            + occupation_value
            + pvalue_level_value
            + new_user_class_level_value
        )

        result.append((userId, adgroupId, vector))

    return result

# create_datasets(88, "430548_1007")
  • 载入训练好的模型
from pyspark.ml.classification import LogisticRegressionModel
# Restore the trained logistic-regression CTR model from HDFS.
CTR_model = LogisticRegressionModel.load(
    "hdfs://localhost:9000/models/CTRModel_AllOneHot.obj"
)

# Candidate rows for user 8 on ad slot "430548_1007"; each row is a
# (userId, adgroupId, vector) tuple, matching the column names below.
candidate_rows = create_datasets(8, "430548_1007")
pdf = pd.DataFrame(candidate_rows, columns=["userId", "adgroupId", "features"])

# Convert the pandas frame into a Spark DataFrame and preview it.
datasets = spark.createDataFrame(pdf)
datasets.show()

显示结果:

+------+---------+--------------------+
|userId|adgroupId|            features|
+------+---------+--------------------+
|     8|   445914|[9.89999961853027...|
|     8|   258252|[7.59999990463256...|
|     8|   129682|[8.5,1.0,0.0,1.0,...|
|     8|   763027|[68.0,1.0,0.0,1.0...|
|     8|   292027|[16.0,1.0,0.0,1.0...|
|     8|   430023|[34.2000007629394...|
|     8|   133457|[169.0,1.0,0.0,1....|
|     8|   816999|[5.0,1.0,0.0,1.0,...|
|     8|   221714|[4.80000019073486...|
|     8|   186334|[106.0,1.0,0.0,1....|
|     8|   169717|[2.20000004768371...|
|     8|    31314|[15.8000001907348...|
|     8|   815312|[2.29999995231628...|
|     8|   199445|[5.0,1.0,0.0,1.0,...|
|     8|   746178|[16.7999992370605...|
|     8|   290950|[6.5,1.0,0.0,1.0,...|
|     8|   221585|[18.5,1.0,0.0,1.0...|
|     8|   692672|[47.0,1.0,0.0,1.0...|
|     8|   797982|[33.0,1.0,0.0,1.0...|
|     8|   815219|[2.40000009536743...|
+------+---------+--------------------+
only showing top 20 rows
# Score every candidate row with the CTR model, then order the rows by the
# "probability" column (ascending). Per the author's notes further down,
# the lower the not-clicked probability, the higher the click probability,
# so the most clickable ads come first.
scored = CTR_model.transform(datasets)
prediction = scored.sort("probability")
prediction.show()
+------+---------+--------------------+--------------------+--------------------+----------+
|userId|adgroupId|            features|       rawPrediction|         probability|prediction|
+------+---------+--------------------+--------------------+--------------------+----------+
|     8|   631204|[19888.0,1.0,0.0,...|[2.69001234046578...|[0.93643471623189...|       0.0|
|     8|   583215|[3750.0,1.0,0.0,1...|[2.69016170680037...|[0.93644360664433...|       0.0|
|     8|   275819|[3280.0,1.0,0.0,1...|[2.69016605691669...|[0.93644386554961...|       0.0|
|     8|   401433|[1200.0,1.0,0.0,1...|[2.69018530849532...|[0.93644501133142...|       0.0|
|     8|    29466|[640.0,1.0,0.0,1....|[2.69019049161265...|[0.93644531980785...|       0.0|
|     8|   173327|[356.0,1.0,0.0,1....|[2.69019312019358...|[0.93644547624893...|       0.0|
|     8|   241402|[269.0,1.0,0.0,1....|[2.69019392542787...|[0.93644552417271...|       0.0|
|     8|   351366|[246.0,1.0,0.0,1....|[2.69019413830591...|[0.93644553684221...|       0.0|
|     8|   229827|[238.0,1.0,0.0,1....|[2.69019421235044...|[0.93644554124900...|       0.0|
|     8|   164807|[228.0,1.0,0.0,1....|[2.69019430490611...|[0.93644554675747...|       0.0|
|     8|   227731|[199.0,1.0,0.0,1....|[2.69019457331754...|[0.93644556273205...|       0.0|
|     8|   265403|[198.0,1.0,0.0,1....|[2.69019458257311...|[0.93644556328290...|       0.0|
|     8|   569939|[188.0,1.0,0.0,1....|[2.69019467512877...|[0.93644556879138...|       0.0|
|     8|   277335|[181.5,1.0,0.0,1....|[2.69019473528996...|[0.93644557237189...|       0.0|
|     8|   575633|[180.0,1.0,0.0,1....|[2.69019474917331...|[0.93644557319816...|       0.0|
|     8|   201867|[179.0,1.0,0.0,1....|[2.69019475842887...|[0.93644557374900...|       0.0|
|     8|    25542|[176.0,1.0,0.0,1....|[2.69019478619557...|[0.93644557540155...|       0.0|
|     8|   133457|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   494224|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   339382|[163.0,1.0,0.0,1....|[2.69019490651794...|[0.93644558256256...|       0.0|
+------+---------+--------------------+--------------------+--------------------+----------+
only showing top 20 rows
  • TOP-20
# TOP-20: keep only the adgroupId column of the sorted predictions and
# fetch the first 20 rows (returned as a list of Row objects).
prediction.select("adgroupId").head(20)

显示结果:

[Row(adgroupId=631204),
 Row(adgroupId=583215),
 Row(adgroupId=275819),
 Row(adgroupId=401433),
 Row(adgroupId=29466),
 Row(adgroupId=173327),
 Row(adgroupId=241402),
 Row(adgroupId=351366),
 Row(adgroupId=229827),
 Row(adgroupId=164807),
 Row(adgroupId=227731),
 Row(adgroupId=265403),
 Row(adgroupId=569939),
 Row(adgroupId=277335),
 Row(adgroupId=575633),
 Row(adgroupId=201867),
 Row(adgroupId=25542),
 Row(adgroupId=133457),
 Row(adgroupId=494224),
 Row(adgroupId=339382)]
# Unwrap the 20 Row objects into a plain list of ad IDs.
[row.adgroupId for row in prediction.select("adgroupId").head(20)]

显示结果:

[631204,
 583215,
 275819,
 401433,
 29466,
 173327,
 241402,
 351366,
 229827,
 164807,
 227731,
 265403,
 569939,
 277335,
 575633,
 201867,
 25542,
 133457,
 494224,
 339382]

# 推荐任务处理:CTR预测模型 + 特征 ==> 预测结果 ==> TOP-N列表
# 特征获取
import redis
import json
import pandas as pd
from pyspark.ml.linalg import DenseVector

def create_datasets(userId, pid):
    """Assemble CTR-model input rows for every ad recalled for one user.

    The user's cached features and the features of each ad in the user's
    recall set are read from Redis. Categorical values are one-hot encoded
    through the ``*_rela`` lookup tables (defined outside this snippet;
    presumably dicts mapping a raw category value to its one-hot position)
    and concatenated behind the ad price into one ``DenseVector``.

    Vector layout (39 values in total):
        price (1), pid (2), cms_group_id (13), final_gender_code (2),
        age_level (7), shopping_level (3), occupation (2),
        pvalue_level (4), new_user_class_level (5)

    Notes on one-hot encoding (from the original author):
        1. ``Pipeline(stages=[StringIndexer, OneHotEncoder])``, e.g.
           ``StringIndexer(inputCol='pid', outputCol='pid_feature')``
           followed by ``OneHotEncoder(dropLast=False,
           inputCol='pid_feature', outputCol='pid_value')``, fitted on the
           raw sample DataFrame. The author notes that the index each
           category receives from ``StringIndexer`` is not fixed from one
           run to the next.
        2. The manual approach used here: start from an all-zero list with
           one slot per category and set the slot given by the ``*_rela``
           table to 1. For ``pid`` there are two slots; the author notes
           that -1 (the stand-in for a missing value) counts as a category
           of its own.

    Args:
        userId: User ID; used as the field of the ``user_features`` hash
            and as the key of the user's recall set.
        pid: Ad-slot ID; looked up in ``pid_rela``.

    Returns:
        A list of ``(userId, adgroupId, vector)`` tuples, one per recalled
        ad that has cached features. Ads without cached features are
        skipped.

    Raises:
        TypeError: if no features are cached for ``userId`` (``hget``
            yields ``None``, which ``json.loads`` rejects).
        KeyError: presumably, if ``pid`` or a categorical value is missing
            from its ``*_rela`` table (assuming the tables are dicts).
    """

    def one_hot(size, index):
        # A list of `size` zeros with a single 1 at position `index`.
        encoded = [0] * size
        encoded[index] = 1
        return encoded

    client_of_recall = redis.StrictRedis(host="192.168.19.137", port=6379, db=9)
    client_of_features = redis.StrictRedis(host="192.168.19.137", port=6379, db=10)

    # User features: "user_features" is the hash name and userId the field;
    # the stored value is the JSON-encoded user feature record.
    user_feature = json.loads(client_of_features.hget("user_features", userId))

    # Recall set of this user. Per the author's notes it holds the 500
    # adgroupIds stored earlier with client.sadd(userId, *ret).
    recall_sets = client_of_recall.smembers(userId)

    result = []
    for adgroupId in recall_sets:
        adgroupId = int(adgroupId)

        # Ad features: "ad_features" is the hash name and adgroupId the
        # field; per the author's notes the stored value carries the price.
        raw_ad_feature = client_of_features.hget("ad_features", adgroupId)
        if raw_ad_feature is None:
            # Nothing cached for this ad, so it cannot be scored. Skip it
            # rather than let json.loads(None) abort the whole request.
            continue
        ad_feature = json.loads(raw_ad_feature)

        # Merge user and ad features; ad values win on a key clash.
        features = dict(user_feature)
        features.update(ad_feature)
        # Replace missing values with -1 before encoding.
        features = {k: (-1 if v is None else v) for k, v in features.items()}

        price = float(features["price"])

        pid_value = one_hot(2, pid_rela[pid])
        cms_group_id_value = one_hot(
            13, cms_group_id_rela[int(features["cms_group_id"])]
        )
        final_gender_code_value = one_hot(
            2, final_gender_code_rela[int(features["final_gender_code"])]
        )
        age_level_value = one_hot(7, age_level_rela[int(features["age_level"])])
        shopping_level_value = one_hot(
            3, shopping_level_rela[int(features["shopping_level"])]
        )
        occupation_value = one_hot(2, occupation_rela[int(features["occupation"])])
        pvalue_level_value = one_hot(
            4, pvalue_level_rela[int(features["pvalue_level"])]
        )
        new_user_class_level_value = one_hot(
            5, new_user_class_level_rela[int(features["new_user_class_level"])]
        )

        # Dense vector: all pieces are joined end to end. For example
        # [18.5, 1.0, 0.0, ...] means price 18.5 followed by pid_value
        # [1.0, 0.0], and so on. The order below defines the feature
        # positions handed to the model, so it must not change.
        vector = DenseVector(
            [price]
            + pid_value
            + cms_group_id_value
            + final_gender_code_value
            + age_level_value
            + shopping_level_value
            + occupation_value
            + pvalue_level_value
            + new_user_class_level_value
        )

        result.append((userId, adgroupId, vector))

    return result

# create_datasets(88, "430548_1007")
# 载入训练好的模型
from pyspark.ml.classification import LogisticRegressionModel
# Restore the trained logistic-regression CTR model from HDFS.
CTR_model = LogisticRegressionModel.load(
    "hdfs://localhost:9000/models/CTRModel_AllOneHot.obj"
)

# userId (user ID) is 8 and pid (ad-slot ID) is "430548_1007".
candidate_rows = create_datasets(8, "430548_1007")

# The column names ["userId", "adgroupId", "features"] line up with the
# (userId, adgroupId, vector) tuples collected in `result`.
pdf = pd.DataFrame(candidate_rows, columns=["userId", "adgroupId", "features"])

# Convert the pandas frame into a Spark DataFrame and preview it.
datasets = spark.createDataFrame(pdf)
datasets.show()

显示结果:
+------+---------+--------------------+
|userId|adgroupId|            features|
+------+---------+--------------------+
|     8|   445914|[9.89999961853027...|
|     8|   258252|[7.59999990463256...|
|     8|   129682|[8.5,1.0,0.0,1.0,...|
|     8|   763027|[68.0,1.0,0.0,1.0...|
|     8|   292027|[16.0,1.0,0.0,1.0...|
|     8|   430023|[34.2000007629394...|
|     8|   133457|[169.0,1.0,0.0,1....|
|     8|   816999|[5.0,1.0,0.0,1.0,...|
|     8|   221714|[4.80000019073486...|
|     8|   186334|[106.0,1.0,0.0,1....|
|     8|   169717|[2.20000004768371...|
|     8|    31314|[15.8000001907348...|
|     8|   815312|[2.29999995231628...|
|     8|   199445|[5.0,1.0,0.0,1.0,...|
|     8|   746178|[16.7999992370605...|
|     8|   290950|[6.5,1.0,0.0,1.0,...|
|     8|   221585|[18.5,1.0,0.0,1.0...|
|     8|   692672|[47.0,1.0,0.0,1.0...|
|     8|   797982|[33.0,1.0,0.0,1.0...|
|     8|   815219|[2.40000009536743...|
+------+---------+--------------------+
only showing top 20 rows

# Output columns: "probability" is the predicted probability and
# "prediction" the predicted label (0 = not clicked, 1 = clicked).
# sort("probability") orders the rows by "probability" in ascending order.
# Reading the result: the lower the probability of NOT being clicked, the
# higher the probability of being clicked, so the best candidates come first.
scored = CTR_model.transform(datasets)
prediction = scored.sort("probability")
prediction.show()

显示结果:
+------+---------+--------------------+--------------------+--------------------+----------+
|userId|adgroupId|            features|       rawPrediction|         probability|prediction|
+------+---------+--------------------+--------------------+--------------------+----------+
|     8|   631204|[19888.0,1.0,0.0,...|[2.69001234046578...|[0.93643471623189...|       0.0|
|     8|   583215|[3750.0,1.0,0.0,1...|[2.69016170680037...|[0.93644360664433...|       0.0|
|     8|   275819|[3280.0,1.0,0.0,1...|[2.69016605691669...|[0.93644386554961...|       0.0|
|     8|   401433|[1200.0,1.0,0.0,1...|[2.69018530849532...|[0.93644501133142...|       0.0|
|     8|    29466|[640.0,1.0,0.0,1....|[2.69019049161265...|[0.93644531980785...|       0.0|
|     8|   173327|[356.0,1.0,0.0,1....|[2.69019312019358...|[0.93644547624893...|       0.0|
|     8|   241402|[269.0,1.0,0.0,1....|[2.69019392542787...|[0.93644552417271...|       0.0|
|     8|   351366|[246.0,1.0,0.0,1....|[2.69019413830591...|[0.93644553684221...|       0.0|
|     8|   229827|[238.0,1.0,0.0,1....|[2.69019421235044...|[0.93644554124900...|       0.0|
|     8|   164807|[228.0,1.0,0.0,1....|[2.69019430490611...|[0.93644554675747...|       0.0|
|     8|   227731|[199.0,1.0,0.0,1....|[2.69019457331754...|[0.93644556273205...|       0.0|
|     8|   265403|[198.0,1.0,0.0,1....|[2.69019458257311...|[0.93644556328290...|       0.0|
|     8|   569939|[188.0,1.0,0.0,1....|[2.69019467512877...|[0.93644556879138...|       0.0|
|     8|   277335|[181.5,1.0,0.0,1....|[2.69019473528996...|[0.93644557237189...|       0.0|
|     8|   575633|[180.0,1.0,0.0,1....|[2.69019474917331...|[0.93644557319816...|       0.0|
|     8|   201867|[179.0,1.0,0.0,1....|[2.69019475842887...|[0.93644557374900...|       0.0|
|     8|    25542|[176.0,1.0,0.0,1....|[2.69019478619557...|[0.93644557540155...|       0.0|
|     8|   133457|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   494224|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   339382|[163.0,1.0,0.0,1....|[2.69019490651794...|[0.93644558256256...|       0.0|
+------+---------+--------------------+--------------------+--------------------+----------+
only showing top 20 rows


# TOP-20: keep only the adgroupId column of the sorted predictions and
# fetch the first 20 rows (returned as a list of Row objects).
prediction.select("adgroupId").head(20)

显示结果:
[Row(adgroupId=631204),
 Row(adgroupId=583215),
 Row(adgroupId=275819),
 Row(adgroupId=401433),
 Row(adgroupId=29466),
 Row(adgroupId=173327),
 Row(adgroupId=241402),
 Row(adgroupId=351366),
 Row(adgroupId=229827),
 Row(adgroupId=164807),
 Row(adgroupId=227731),
 Row(adgroupId=265403),
 Row(adgroupId=569939),
 Row(adgroupId=277335),
 Row(adgroupId=575633),
 Row(adgroupId=201867),
 Row(adgroupId=25542),
 Row(adgroupId=133457),
 Row(adgroupId=494224),
 Row(adgroupId=339382)]


# Unwrap the 20 Row objects into a plain list of ad IDs.
[row.adgroupId for row in prediction.select("adgroupId").head(20)]

显示结果:
[631204,
 583215,
 275819,
 401433,
 29466,
 173327,
 241402,
 351366,
 229827,
 164807,
 227731,
 265403,
 569939,
 277335,
 575633,
 201867,
 25542,
 133457,
 494224,
 339382]

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

あずにゃん

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值