大数据一个重要的应用是预测用户喜好,例如相关广告的推送、相关产品的推荐、相关图书电影的推荐等。这里我们使用Spark的机器学习来展示如何进行预测,并基于此快速构建一个电影评分及推荐应用。
找到文件
import os
from databricks_test_helper import Test
# Build the paths of the ratings and movies CSV files.
# dbfs_dir is a placeholder for the directory that holds both files.
dbfs_dir = '...'
ratings_filename = dbfs_dir + '/ratings.csv'
movies_filename = dbfs_dir + '/movies.csv'

# The paths above are spelled with '/'; on platforms whose separator differs,
# rewrite both of them to use the native separator.
if os.path.sep != '/':
    ratings_filename = ratings_filename.replace('/', os.path.sep)
    movies_filename = movies_filename.replace('/', os.path.sep)
定义数据架构
from pyspark.sql.types import *
# Explicit schema for the ratings data: integer user and movie ids plus a
# floating-point rating.
_ratings_fields = [
    StructField('userId', IntegerType()),
    StructField('movieId', IntegerType()),
    StructField('rating', DoubleType()),
]
ratings_df_schema = StructType(_ratings_fields)

# Explicit schema for the movies data: integer id (named 'ID') and the title.
_movies_fields = [
    StructField('ID', IntegerType()),
    StructField('title', StringType()),
]
movies_df_schema = StructType(_movies_fields)
载入数据并缓存
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *
def _read_csv(path, schema):
    """Load a CSV file that has a header row into a DataFrame with *schema*.

    Schema inference is switched off; the explicit schema is applied instead.
    """
    return (sqlContext.read
            .format('com.databricks.spark.csv')
            .options(header=True, inferSchema=False)
            .schema(schema)
            .load(path))


# Ratings: load, then drop the 'Timestamp' column if it is present.
raw_ratings_df = _read_csv(ratings_filename, ratings_df_schema)
ratings_df = raw_ratings_df.drop('Timestamp')

# Movies: load, drop 'Genres' if present and make sure the id column is 'ID'.
raw_movies_df = _read_csv(movies_filename, movies_df_schema)
movies_df = raw_movies_df.drop('Genres').withColumnRenamed('movieId', 'ID')

# Both DataFrames are reused many times below, so keep them cached.
ratings_df.cache()
movies_df.cache()
assert ratings_df.is_cached
assert movies_df.is_cached

raw_ratings_count = raw_ratings_df.count()
ratings_count = ratings_df.count()
raw_movies_count = raw_movies_df.count()
movies_count = movies_df.count()

# Single-argument print(...) behaves the same as a statement (Python 2) and
# as a function call (Python 3), matching the other print calls in this file.
print('There are %s ratings and %s movies in the datasets' % (ratings_count, movies_count))
print('Ratings:')
ratings_df.show(3)
print('Movies:')
movies_df.show(3, truncate=False)

# Dropping/renaming columns must not change the number of rows.
assert raw_ratings_count == ratings_count
assert raw_movies_count == movies_count
快速浏览一下数据结构
# Show the movies DataFrame. display() is not defined in this file --
# presumably the Databricks notebook helper; confirm before running elsewhere.
display(movies_df)
平均分最高的电影
from pyspark.sql import functions as F
# Number of ratings and mean rating for every movie id.
movie_ids_with_avg_ratings_df = (
    ratings_df
    .groupBy('movieId')
    .agg(F.count(ratings_df.rating).alias("count"),
         F.avg(ratings_df.rating).alias("average"))
)
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Attach the movie title by joining on the movie id.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df.movieId == movies_df.ID)

# Sort by average rating, highest first, so that show() really lists the
# best-rated movies (without the sort the row order is arbitrary).
movie_names_with_avg_ratings_df = (
    movie_names_df
    .select('average', 'title', 'count', 'movieId')
    .orderBy('average', ascending=False)
)
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)
结果类似于
movie_ids_with_avg_ratings_df:
+-------+-----+------------------+
|movieId|count|average |
+-------+-----+------------------+
|1831 |7463 |2.5785207021305103|
|431 |8946 |3.695059244355019 |
|631 |2193 |2.7273141814865483|
+-------+-----+------------------+
only showing top 3 rows
movie_names_with_avg_ratings_df:
+-------+-----------------------------+-----+-------+
|average|title |count|movieId|
+-------+-----------------------------+-----+-------+
|5.0 |Ella Lola, a la Trilby (1898)|1 |94431 |
|5.0 |Serving Life (2011) |1 |129034 |
|5.0 |Diplomatic Immunity (2009? ) |1 |107434 |
+-------+-----------------------------+-----+-------+
only showing top 3 rows
拥有500条以上评价的电影中平均分最高的电影
# Restrict to movies with at least 500 ratings and order them by average
# rating, highest first, so the listing below matches its title.
movies_with_500_ratings_or_more = (
    movie_names_with_avg_ratings_df
    .filter('count>=500')
    .orderBy('average', ascending=False)
)
print('Movies with highest ratings:')
movies_with_500_ratings_or_more.show(20, truncate=False)
协作过滤
协作过滤是一种利用用户整体(协作)偏好信息来进行自动预测的方法。它基于这样的假设,如果用户A和用户B在某个问题上意见一致,那么A与B在另一问题X上更有可能意见一致。
首先我们将评价数据分割成训练集、验证集和测试集
# The same seed is passed to randomSplit here and to ALS further down.
seed = 43

# 60% training, 20% validation, 20% test.
split_weights = [0.6, 0.2, 0.2]
split_60_df, split_a_20_df, split_b_20_df = ratings_df.randomSplit(split_weights, seed)

# Cache each split; all three are used repeatedly below.
training_df = split_60_df.cache()
validation_df = split_a_20_df.cache()
test_df = split_b_20_df.cache()

split_sizes = (training_df.count(), validation_df.count(), test_df.count())
print('Training: {0}, validation: {1}, test: {2}\n'.format(*split_sizes))

# Peek at the first rows of every split.
for split_df in (training_df, validation_df, test_df):
    split_df.show(3)
使用ALS函数并设置参数
from pyspark.ml.recommendation import ALS
# Configure the ALS estimator: 5 iterations, the shared seed, regularization
# 0.1, and the column names used by the ratings DataFrames.
als = ALS()
als.setMaxIter(5)
als.setSeed(seed)
als.setRegParam(0.1)
als.setUserCol('userId')
als.setItemCol('movieId')
als.setRatingCol('rating')
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the 'prediction' column and the true 'rating' column.
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
# One slot per candidate rank, sized from `ranks` so the lists stay in sync
# if more ranks are added.
errors = [0] * len(ranks)
models = [0] * len(ranks)
min_error = float('inf')
best_rank = -1  # index into `ranks` of the best model found so far

for index, rank in enumerate(ranks):
    als.setRank(rank)
    model = als.fit(training_df)
    predict_df = model.transform(validation_df)

    # Discard rows whose prediction is NaN before scoring. Spark SQL treats
    # NaN as equal to NaN, so this inequality does remove them.
    predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))

    error = reg_eval.evaluate(predicted_ratings_df)
    errors[index] = error
    models[index] = model
    print('For rank %s the RMSE is %s' % (rank, error))

    if error < min_error:
        min_error = error
        best_rank = index

# Leave the estimator configured with the winning rank for later re-fitting.
als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]
测试模型
# Score the test split with the model selected by the rank search.
predict_df = my_model.transform(test_df)

# Keep only rows that received a usable (non-NaN) prediction.
has_prediction = predict_df.prediction != float('nan')
predicted_test_df = predict_df.filter(has_prediction)

test_RMSE = reg_eval.evaluate(predicted_test_df)
rmse_message = 'The model had a RMSE on the test set of {0}'.format(test_RMSE)
print(rmse_message)
比较模型
from pyspark.sql.functions import lit
# Baseline model: always predict the mean rating of the training split.
avg_rating_df = training_df.groupBy().avg('rating')
training_avg_rating = avg_rating_df.collect()[0][0]
print('The average rating for movies in the training set is {0}'.format(training_avg_rating))

# Evaluate the baseline on the TEST split (not the training split) so that
# its RMSE is directly comparable with test_RMSE of the ALS model.
test_for_avg_df = test_df.withColumn('prediction', lit(training_avg_rating))
test_avg_RMSE = reg_eval.evaluate(test_for_avg_df)
print("The RMSE on the average set is {0}".format(test_avg_RMSE))
进行新的预测
评价你的电影
from pyspark.sql import Row
# User id for the hand-entered ratings below.
# NOTE(review): assumes id 0 is not used by any user in the ratings data -- confirm.
my_user_id = 0

# (userId, movieId, rating) triples.
my_rated_movies = [
    (my_user_id, 318, 4),
    (my_user_id, 858, 4),
    (my_user_id, 527, 4),
    (my_user_id, 1221, 4),
    (my_user_id, 260, 4),
    (my_user_id, 1196, 4),
    (my_user_id, 2571, 5),
    (my_user_id, 94466, 5),
    (my_user_id, 593, 4),
    (my_user_id, 1197, 4),
]
my_ratings_df = sqlContext.createDataFrame(my_rated_movies, ['userId', 'movieId', 'rating'])

# print(...) with a single argument works on both Python 2 and Python 3 and
# matches the other print calls in this file.
print('My movie ratings:')
display(my_ratings_df.limit(10))
将你的电影评价加入训练集
# Append the hand-entered ratings to the training split.
training_with_my_ratings_df = training_df.unionAll(my_ratings_df)

growth_message = 'The training dataset now has %s more entries than the original training dataset'
print(growth_message % (training_with_my_ratings_df.count() - training_df.count()))

# The union must have grown by exactly the number of added ratings.
assert training_with_my_ratings_df.count() - training_df.count() == my_ratings_df.count()
使用加入了你的评价的训练集重新训练模型
# Re-apply the estimator settings, then fit on the training data that now
# includes the hand-entered ratings.
als.setPredictionCol("prediction")
als.setMaxIter(5)
als.setSeed(seed)
als.setRegParam(0.1)
als.setUserCol('userId')
als.setItemCol('movieId')
als.setRatingCol('rating')

my_ratings_model = als.fit(training_with_my_ratings_df)
检查新模型的RMSE
# Score the test split with the re-trained model.
my_predict_df = my_ratings_model.transform(test_df)

# Keep only rows that received a usable (non-NaN) prediction.
my_has_prediction = my_predict_df.prediction != float('nan')
predicted_test_my_ratings_df = my_predict_df.filter(my_has_prediction)

test_RMSE_my_ratings = reg_eval.evaluate(predicted_test_my_ratings_df)
my_rmse_message = 'The model had a RMSE on the test set of {0}'.format(test_RMSE_my_ratings)
print(my_rmse_message)
预测你对新电影的评分
# Movie ids that already have a hand-entered rating (second tuple element).
my_rated_movie_ids = [movie_id for (_, movie_id, _) in my_rated_movies]

# Every movie that has NOT been rated by hand yet.
already_rated = movies_df['ID'].isin(my_rated_movie_ids)
not_rated_df = movies_df.filter(~already_rated)

# Shape the rows the way the model expects: 'movieId' plus a constant 'userId'.
my_unrated_movies_df = (
    not_rated_df
    .withColumnRenamed('ID', 'movieId')
    .withColumn('userId', lit(my_user_id))
)

raw_predicted_ratings_df = my_ratings_model.transform(my_unrated_movies_df)

# Drop rows whose prediction is NaN.
predicted_ratings_df = raw_predicted_ratings_df.filter(
    raw_predicted_ratings_df['prediction'] != float('nan'))
输出预测评分最高的电影
# Add each movie's rating count and average to its predicted rating.
predicted_with_counts_df = predicted_ratings_df.join(movie_names_with_avg_ratings_df, 'movieId')

# Highest predicted rating first, then keep movies with more than 75 ratings.
ranked_predictions_df = predicted_with_counts_df.orderBy('prediction', ascending=False)
predicted_highest_rated_movies_df = ranked_predictions_df.filter('count>75')

print('My 25 highest rated movies as predicted (for movies with more than 75 reviews):')
predicted_highest_rated_movies_df.show(25)