Maven 依赖配置(pom.xml 片段)
<!-- Version properties: keep all Spark/Scala/Hadoop versions in one place so an
     upgrade only touches this block. Every <dependency> below references these
     properties instead of hard-coding version literals. -->
<properties>
    <scala.binary.version>2.12</scala.binary.version>
    <scala.version>2.12.11</scala.version>
    <spark.version>3.0.0</spark.version>
    <mllib.version>3.0.0</mllib.version>
    <netlib.java.version>1.1.2</netlib.java.version>
    <jblas.version>1.2.3</jblas.version>
    <hadoop.version>3.1.3</hadoop.version>
</properties>
<dependencies>
    <!-- Spark Core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Spark SQL -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Spark YARN integration -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Scala standard library -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <!-- Spark MLlib (machine learning) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_${scala.binary.version}</artifactId>
        <version>${mllib.version}</version>
    </dependency>
    <!-- Native linear-algebra backends used by MLlib -->
    <dependency>
        <groupId>com.github.fommil.netlib</groupId>
        <artifactId>all</artifactId>
        <version>${netlib.java.version}</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>org.jblas</groupId>
        <artifactId>jblas</artifactId>
        <version>${jblas.version}</version>
    </dependency>
    <!-- Hadoop client for HDFS/YARN access -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- NOTE(review): Hadoop 3.1.x bundles an older Guava; verify that
         29.0-jre does not cause classpath conflicts at runtime. -->
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>29.0-jre</version>
    </dependency>
</dependencies>
示例代码
package com.dream.ml.rs.rdd
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Collaborative-filtering movie recommendation using the Spark MLlib ALS
 * algorithm on the MovieLens (ml-100k) rating data set.
 *
 * Requirements:
 *   (1) predict the rating a given user (User) would give to a given movie (Product),
 *   (2) recommend 10 movies for a given user,
 *   (3) recommend 10 users for a given movie.
 * The trained model is also saved to the file system and reloaded to
 * demonstrate model persistence.
 *
 * @author MXH
 * @since 2023/9/1
 */
object SparkAlsRmdMovie {

  /**
   * Builds a local SparkContext. A checkpoint directory is configured because
   * ALS is iterative and checkpointing truncates the growing RDD lineage.
   */
  private def createContext(): SparkContext = {
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local[4]")
      .setAppName(this.getClass.getSimpleName.stripSuffix("$"))
    val context: SparkContext = SparkContext.getOrCreate(sparkConf)
    context.setCheckpointDir(s"datas/ckpt/als-ml-${System.nanoTime()}")
    context
  }

  /**
   * Parses raw tab-separated "user movie rating timestamp" lines into
   * [[Rating]] records. Null or malformed lines (field count != 4) are
   * dropped. Each line is split exactly once (the original implementation
   * split every line twice: once in `filter`, again in `map`).
   */
  private def parseRatings(rawRatingRDD: RDD[String]): RDD[Rating] = {
    rawRatingRDD.flatMap { line =>
      val fields = if (null == line) Array.empty[String] else line.split("\\t")
      if (fields.length == 4) {
        val Array(userId, movieId, rating, _) = fields
        Some(Rating(userId.toInt, movieId.toInt, rating.toDouble))
      } else {
        None
      }
    }
  }

  /** Prints the first `num` rows of a factor matrix as "id -> f1,f2,...". */
  private def printFeatures(features: RDD[(Int, Array[Double])], num: Int = 10): Unit = {
    features.take(num).foreach { case (id, factors) =>
      println(id + " -> " + factors.mkString(","))
    }
  }

  /**
   * Evaluates the model on held-out ratings using RMSE (root mean squared
   * error): the smaller the value, the smaller the prediction error.
   */
  private def evaluateModel(alsModel: MatrixFactorizationModel, testRatings: RDD[Rating]): Unit = {
    // ((userId, productId), actualRating)
    val actualRatingsRDD: RDD[((Int, Int), Double)] = testRatings.map { r =>
      ((r.user, r.product), r.rating)
    }
    // Predict ratings for exactly the (user, product) pairs in the test set.
    val predictRatingsRDD: RDD[((Int, Int), Double)] = alsModel
      .predict(actualRatingsRDD.map(_._1))
      .map(r => ((r.user, r.product), r.rating))
    // Join predictions with ground truth; RegressionMetrics expects
    // (prediction, observation) pairs.
    val metrics = new RegressionMetrics(predictRatingsRDD.join(actualRatingsRDD).map(_._2))
    println(s"RMSE = ${metrics.rootMeanSquaredError}")
  }

  def main(args: Array[String]): Unit = {
    // 1. Build the SparkContext instance.
    val sc: SparkContext = createContext()
    // try/finally guarantees the context is released even if a stage fails
    // (the original code leaked the context on any exception).
    try {
      // 2. Load the raw rating data (ml-100k ships 100,000 ratings).
      val rawRatingRDD: RDD[String] = sc.textFile("datas/als/ml-100k/u.data")
      println(s"Count = ${rawRatingRDD.count()}")
      // e.g. "196\t242\t3\t881250949"
      println(s"First: ${rawRatingRDD.first()}")
      // 3. Parse into Rating records and split 80% train / 20% test.
      val ratingsRDD: RDD[Rating] = parseRatings(rawRatingRDD)
      val Array(trainRatings, testRatings) = ratingsRDD.randomSplit(Array(0.8, 0.2))
      // 4. Train the explicit-feedback ALS model: 10 latent features, 20 iterations.
      val alsModel: MatrixFactorizationModel = ALS.train(
        ratings = trainRatings,
        rank = 10,
        iterations = 20
      )
      // 5. Inspect the two factor matrices learned by ALS.
      printFeatures(alsModel.userFeatures)
      println("=======================================")
      printFeatures(alsModel.productFeatures)
      // 6. Evaluate the model with RMSE on the held-out test set.
      evaluateModel(alsModel, testRatings)
      // 7. Recommendations and point predictions.
      val predictRating: Double = alsModel.predict(user = 196, product = 242)
      println(s"预测用户196对电影242的评分: ${predictRating}")
      println("------------------------------------------")
      val rmdMovies: Array[Rating] = alsModel.recommendProducts(user = 196, num = 10)
      rmdMovies.foreach(println)
      println("------------------------------------------")
      val rmdUsers: Array[Rating] = alsModel.recommendUsers(product = 242, 10)
      rmdUsers.foreach(println)
      // 8. Persist the model, then reload it to verify the round trip.
      val modelPath = s"datas/als/ml-als-model-" + System.nanoTime()
      alsModel.save(sc, modelPath)
      val loadAlsModel: MatrixFactorizationModel = MatrixFactorizationModel.load(sc, modelPath)
      val loadPredictRating: Double = loadAlsModel.predict(user = 196, product = 242)
      println(s"加载模型预测用户196对电影242的评分: $loadPredictRating")
    } finally {
      // 9. Always release the SparkContext.
      sc.stop()
    }
  }
}