pyspark mllib ALS 实践

最新推荐文章于 2023-08-15 19:08:29 发布

Last_xuan1

最新推荐文章于 2023-08-15 19:08:29 发布

阅读量626

点赞数 1

分类专栏：推荐系统 # Spark ML # Spark

本文链接：https://blog.csdn.net/qq_43391383/article/details/104169502

版权

推荐系统同时被 3 个专栏收录

11 篇文章 1 订阅

订阅专栏

Spark

7 篇文章 0 订阅

订阅专栏

Spark ML

2 篇文章 0 订阅

订阅专栏

数据集 https://tianchi.aliyun.com/dataset/dataDetail?dataId=56

基于 ALS 的 LFM 算法进行候选集召回

# Copy the first 1,000,000 rows (1000 chunks x 1000 rows) of the roughly
# 700-million-row behavior_log dataset into a smaller CSV file.
import pandas as pd
import warnings
from itertools import islice

warnings.filterwarnings("ignore")
path = r'D:\阿里ctr预估数据集\behavior_log.csv\behavior_log.csv\behavior_log.csv' # roughly 22 GB
# chunksize=1000 makes read_csv return a lazy reader that yields one
# 1000-row DataFrame (a "chunk") at a time instead of loading the whole file.
reader = pd.read_csv(path, chunksize=1000, iterator=True)

try:
    # islice stops after 1000 chunks, or earlier when the file is shorter, so
    # a small input no longer aborts the cell with StopIteration.
    for num, chunk in enumerate(islice(reader, 1000)):
        is_first = num == 0
        # The first chunk creates the file and writes the header row; every
        # later chunk is appended without a header.
        chunk.to_csv('behavior_log_part.csv', index=False,
                     mode='w' if is_first else 'a+', header=is_first)
finally:
    # Release the handle on the source file even if a write fails.
    reader.close()

d = pd.read_csv('behavior_log_part.csv')
d

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

spark = SparkSession.builder.appName('behavior_log').getOrCreate()

# Declare the type of every CSV column explicitly.
column_types = [
    ('user', IntegerType),
    ('time_stamp', LongType),
    ('btag', StringType),
    ('cate', IntegerType),
    ('brand', IntegerType),
]
behavior_log_schema = StructType(
    [StructField(name, dtype()) for name, dtype in column_types]
)

behavior_log_df = spark.read.csv(
    'behavior_log_part.csv', header=True, schema=behavior_log_schema
)
behavior_log_df.show()
behavior_log_df.count()  # number of rows

+------+----------+----+-----+------+
|  user|time_stamp|btag| cate| brand|
+------+----------+----+-----+------+
|558157|1493741625|  pv| 6250| 91286|
|558157|1493741626|  pv| 6250| 91286|
|558157|1493741627|  pv| 6250| 91286|
|728690|1493776998|  pv|11800| 62353|
|332634|1493809895|  pv| 1101|365477|
|857237|1493816945|  pv| 1043|110616|
|619381|1493774638|  pv|  385|428950|
|467042|1493772641|  pv| 8237|301299|
|467042|1493772644|  pv| 8237|301299|
|991528|1493780710|  pv| 7270|274795|
|991528|1493780712|  pv| 7270|274795|
|991528|1493780712|  pv| 7270|274795|
|991528|1493780712|  pv| 7270|274795|
|991528|1493780714|  pv| 7270|274795|
|991528|1493780765|  pv| 7270|274795|
|991528|1493780714|  pv| 7270|274795|
|991528|1493780765|  pv| 7270|274795|
|991528|1493780764|  pv| 7270|274795|
|991528|1493780633|  pv| 7270|274795|
|991528|1493780764|  pv| 7270|274795|
+------+----------+----+-----+------+
only showing top 20 rows

# Remove the pandas DataFrame `d`; the steps below work on the Spark DataFrame.
del d

因为除时间戳以外的字段都是离散值，不需要太多预处理，这里只查看一下各离散特征的取值个数

# Count the distinct values of each categorical column.
def unique_count(group_obj):
    """Return the number of groups in a grouped DataFrame.

    The first ``count()`` is the aggregation, which yields one row per group;
    the second is ``DataFrame.count()``, which returns the number of rows.
    """
    return group_obj.count().count()


# Every column goes through the same helper so the four counts cannot drift.
print('用户数', unique_count(behavior_log_df.groupBy('user')))
print('用户行为类别', unique_count(behavior_log_df.groupBy('btag')))
print('商品类目数', unique_count(behavior_log_df.groupBy('cate')))
print('品牌数', unique_count(behavior_log_df.groupBy('brand')))

用户数 242109
用户行为类别 4
商品类目数 6044
品牌数 49179

# Number of rows that contain at least one null value.
total_rows = behavior_log_df.count()
complete_rows = behavior_log_df.dropna().count()
total_rows - complete_rows

# Count how often each user performed each behavior type on each category;
# these counts are later combined with per-behavior weights into a rating.
# pivot() turns the listed btag values into columns, so the result has one row
# per (user, cate) pair and one count column per behavior type.
behavior_types = ['pv', 'fav', 'cart', 'buy']
per_user_cate = behavior_log_df.groupBy('user', 'cate')
cate_count_df = per_user_cate.pivot('btag', behavior_types).count()
cate_count_df.show()

+-------+----+---+----+----+----+
|   user|cate| pv| fav|cart| buy|
+-------+----+---+----+----+----+
| 738396|9687| 12|null|   1|null|
| 590694|6432|  3|null|null|null|
|1056983|4283|  9|null|   1|null|
| 950232|5953|  1|null|null|null|
| 386718|9295|  5|null|null|null|
| 370007|6426|  1|null|null|null|
|  38456|6423|  1|null|null|null|
| 159877|4285|  1|null|null|null|
| 300675|5339|  2|   1|null|null|
|1113161|4290|  1|null|null|null|
| 898724|4520|  1|null|null|null|
| 569644|1665|  1|null|null|null|
|   8267|5945|  1|null|null|null|
| 267476|6009|  7|null|null|null|
| 873191|5942|  1|null|null|null|
| 609520|8964|  1|null|null|null|
| 767414|6409| 17|   1|null|null|
|  62553|6423|  1|null|null|null|
|1062120|9447|  1|null|null|null|
| 109275|6438|  1|null|null|null|
+-------+----+---+----+----+----+
only showing top 20 rows

根据用户对类目偏好打分训练ALS模型

# Turn one row of behavior counts into a (user, cate, score) rating.
def process_row(row):
    """Combine the behavior counts of ``row`` into one weighted score.

    ``row`` is indexed by column name and must provide ``user``, ``cate`` and
    the count columns ``pv``, ``fav``, ``cart`` and ``buy``. A count that is
    ``None`` or zero contributes nothing. Each behavior adds
    ``weight * count`` while its count is at most 20, and a fixed maximum
    once the count exceeds 20.

    Returns the tuple ``(row['user'], row['cate'], score)``.
    """
    # (column, weight per occurrence, score used once the count exceeds 20)
    behavior_weights = (
        ('pv', 0.2, 4.0),
        ('fav', 0.4, 8.0),
        ('cart', 0.6, 12.0),
        ('buy', 1.0, 20.0),
    )

    score = 0.0
    # Contributions are added in column order, so the floating-point result
    # does not depend on anything but the counts themselves.
    for column, weight, capped_score in behavior_weights:
        count = row[column] or .0
        if count <= 20:
            score += weight * count
        else:
            score += capped_score
    return row['user'], row['cate'], score


# Score every (user, cate) row and collect the results as a ratings DataFrame.
rating_columns = ['user', 'cate', 'score']
scored_rows = cate_count_df.rdd.map(process_row)
cate_score_df = scored_rows.toDF(rating_columns)

cate_score_df.show()

+-------+----+------------------+
|   user|cate|             score|
+-------+----+------------------+
| 738396|9687|3.0000000000000004|
| 590694|6432|0.6000000000000001|
|1056983|4283|               2.4|
| 950232|5953|               0.2|
| 386718|9295|               1.0|
| 370007|6426|               0.2|
|  38456|6423|               0.2|
| 159877|4285|               0.2|
| 300675|5339|               0.8|
|1113161|4290|               0.2|
| 898724|4520|               0.2|
| 569644|1665|               0.2|
|   8267|5945|               0.2|
| 267476|6009|1.4000000000000001|
| 873191|5942|               0.2|
| 609520|8964|               0.2|
| 767414|6409|3.8000000000000003|
|  62553|6423|               0.2|
|1062120|9447|               0.2|
| 109275|6438|               0.2|
+-------+----+------------------+
only showing top 20 rows

%%time
from pyspark.ml.recommendation import ALS

# 如果不用 spark 可以使用 implicit 库，也有 ALS 算法
# implicit 库 生成 用户-物品 评分表，也是对应的处理步骤
als = ALS(userCol='user', itemCol='cate', ratingCol='score', checkpointInterval=5)
model = als.fit(cate_score_df)

Wall time: 1min 53s

%%time
# 为每个用户推荐 Top-N 个物品
recommend_df = model.recommendForAllUsers(5)
recommend_df.show()

+-----+--------------------+
| user|     recommendations|
+-----+--------------------+
|  463|[[6455, 2.9882045...|
|  471|[[9558, 0.3299492...|
|  833|[[1720, 0.2581508...|
| 1238|[[12528, 0.681818...|
| 1829|[[5595, 4.1331573...|
| 2142|[[8558, 0.4138768...|
| 2659|[[9953, 0.2536223...|
| 4519|[[8652, 0.3156553...|
| 4935|[[8652, 0.6086501...|
| 6658|[[7777, 0.3495224...|
| 7340|[[5595, 1.4034023...|
| 7993|[[12063, 3.548402...|
|11317|[[8652, 0.9129751...|
|11458|[[9141, 1.4156919...|
|13840|[[8652, 1.8939321...|
|17420|[[11831, 0.272165...|
|18051|[[5595, 0.3890690...|
|18654|[[1802, 0.2476846...|
|18979|[[7777, 0.6990449...|
|19079|[[11831, 0.272165...|
+-----+--------------------+
only showing top 20 rows

Wall time: 54.5 s

# Show only the recommendations column, limited to the first row.
recommendations_only = recommend_df.select('recommendations')
recommendations_only.show(n=1)

+--------------------+
|     recommendations|
+--------------------+
|[[6455, 2.9882045...|
+--------------------+
only showing top 1 row

# Persist the fitted model (save(path) is shorthand for write().save(path)).
# NOTE(review): per the Spark docs the save fails if the path already exists;
# model.write().overwrite().save(...) would replace it. Confirm which is wanted.
model.write().save('als_model.obj')

# Load raw_sample.csv with its header row. No schema is supplied here, and the
# printed schema shows every column as a string.
raw_sample_path = r"D:\阿里ctr预估数据集\raw_sample.csv"
df = spark.read.csv(raw_sample_path, header=True)
df.show()
df.printSchema()

+------+----------+----------+-----------+------+---+
|  user|time_stamp|adgroup_id|        pid|nonclk|clk|
+------+----------+----------+-----------+------+---+
|581738|1494137644|         1|430548_1007|     1|  0|
|449818|1494638778|         3|430548_1007|     1|  0|
|914836|1494650879|         4|430548_1007|     1|  0|
|914836|1494651029|         5|430548_1007|     1|  0|
|399907|1494302958|         8|430548_1007|     1|  0|
|628137|1494524935|         9|430548_1007|     1|  0|
|298139|1494462593|         9|430539_1007|     1|  0|
|775475|1494561036|         9|430548_1007|     1|  0|
|555266|1494307136|        11|430539_1007|     1|  0|
|117840|1494036743|        11|430548_1007|     1|  0|
|739815|1494115387|        11|430539_1007|     1|  0|
|623911|1494625301|        11|430548_1007|     1|  0|
|623911|1494451608|        11|430548_1007|     1|  0|
|421590|1494034144|        11|430548_1007|     1|  0|
|976358|1494156949|        13|430548_1007|     1|  0|
|286630|1494218579|        13|430539_1007|     1|  0|
|286630|1494289247|        13|430539_1007|     1|  0|
|771431|1494153867|        13|430548_1007|     1|  0|
|707120|1494220810|        13|430548_1007|     1|  0|
|530454|1494293746|        13|430548_1007|     1|  0|
+------+----------+----------+-----------+------+---+
only showing top 20 rows

root
 |-- user: string (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- adgroup_id: string (nullable = true)
 |-- pid: string (nullable = true)
 |-- nonclk: string (nullable = true)
 |-- clk: string (nullable = true)