Suppose we run an online movie website, and the company wants to use big-data analytics to power a recommendation engine that increases the number of movies its members watch.
The ALS algorithm:
ALS is a model-based recommendation algorithm. Its basic idea is to factor the sparse rating matrix into a model, estimate the missing entries from that factorization, and thereby obtain a trained model that can then score new users and items. ALS computes the missing entries with alternating least squares, a method developed from ordinary least squares.
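As a minimal, self-contained sketch of that idea (the toy data and the AlsSketch name are ours, purely for illustration): factor a small sparse rating matrix, then ask the model to fill in an entry that was never observed.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
object AlsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("AlsSketch").setMaster("local[2]"))
    // Toy ratings (user, movie, rating); user 1 has never rated movie 3.
    val ratings = sc.parallelize(Seq(
      Rating(1, 1, 5.0), Rating(1, 2, 1.0),
      Rating(2, 1, 4.0), Rating(2, 3, 2.0),
      Rating(3, 2, 1.0), Rating(3, 3, 5.0)))
    // rank = 2 latent factors, 10 iterations, lambda = 0.01
    val model = ALS.train(ratings, 2, 10, 0.01)
    // The missing entry is estimated from the learned user and movie factor vectors.
    println("Predicted rating of movie 3 by user 1: " + model.predict(1, 3))
    sc.stop()
  }
}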
User ratings of items fall into two categories:
Explicit ratings:
- The user rates a product on the site directly, e.g. 1 to 5 stars.
Implicit ratings:
- The site does not ask the user to rate anything, but it records whether the user clicked on a product.
Data files: this example uses the MovieLens dataset; u.data holds the user ratings and u.item maps movie IDs to titles.
MLlib covers both rating types, as the sketch below shows.
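A hedged sketch of the two training entry points (toy data, runnable in spark-shell where sc already exists; the alpha value is only an illustration):

import org.apache.spark.mllib.recommendation.{ALS, Rating}
// Explicit feedback: the rating field is a 1-5 star score.
val explicitRatings = sc.parallelize(Seq(Rating(1, 1, 5.0), Rating(1, 2, 1.0), Rating(2, 2, 4.0)))
val explicitModel = ALS.train(explicitRatings, 5, 10, 0.01)
// Implicit feedback: the rating field is a click count, interpreted as a confidence weight.
val clicks = sc.parallelize(Seq(Rating(1, 1, 3.0), Rating(1, 2, 1.0), Rating(2, 2, 7.0)))
val implicitModel = ALS.trainImplicit(clicks, 5, 10, 0.01, 1.0) // last argument: alpha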
Creating the Recommend project:
1. Create the Recommend.scala file
2. Import the required libraries
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
3. Business logic code: the interactive logic is the recommend() function, listed in step 6 below.
The main program is divided into 3 parts:
- the data preparation stage
- the training stage
- the recommendation stage
4. SetLogger: suppress log output
def SetLogger = {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("com").setLevel(Level.OFF)
  System.setProperty("spark.ui.showConsoleProgress", "false")
  Logger.getRootLogger().setLevel(Level.OFF)
}
5. Create the PrepareData() function
def PrepareData(): (RDD[Rating], Map[Int, String]) = {
  val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
  // During iterative computation Spark's RDD lineage grows rapidly, and with it the stack
  // space required, until the stack overflows; explicitly specifying a checkpoint
  // directory solves the problem.
  sc.setCheckpointDir("checkpoint")
  //----------------------1. Create the user rating data-------------
  print("Reading user rating data...")
  //val DataDir = "data"
  //val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
  val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
  val rawRatings = rawUserData.map(_.split("\t").take(3))
  val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
  println("Total: " + ratingsRDD.count.toString() + " ratings")
  //----------------------2. Create the movie-ID-to-title lookup table-------------
  print("Reading movie data...")
  //val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
  val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
  val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
    .map(array => (array(0).toInt, array(1))).collect().toMap
  //----------------------3. Show record counts-------------
  val numRatings = ratingsRDD.count()
  val numUsers = ratingsRDD.map(_.user).distinct().count()
  val numMovies = ratingsRDD.map(_.product).distinct().count()
  println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
  return (ratingsRDD, movieTitle)
}
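For reference, each line of u.data is tab-separated with four fields — user id, movie id, rating, timestamp (the timestamp is dropped by take(3)) — and u.item is |-separated with the movie id and title in its first two fields. A sample u.data line (values illustrative):

196	242	3	881250949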
6. The recommend() driver code
def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
  var choose = ""
  while (choose != "3") { // choosing 3 (quit) ends the program
    print("Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? ")
    choose = readLine() // read user input; on Scala 2.11+ use scala.io.StdIn.readLine()
    if (choose == "1") { // 1: recommend movies for a user
      print("Enter user id? ")
      val inputUserID = readLine() // read the user ID
      RecommendMovies(model, movieTitle, inputUserID.toInt) // recommend movies for this user
    } else if (choose == "2") { // 2: recommend interested users for a movie
      print("Enter movie id? ")
      val inputMovieID = readLine() // read the movie ID
      RecommendUsers(model, movieTitle, inputMovieID.toInt) // recommend users for this movie
    }
  }
}
7. The main function
def main(args: Array[String]) {
  // suppress verbose log output
  SetLogger
  println("========== Data preparation ===============")
  val (ratings, movieTitle) = PrepareData()
  println("========== Training ===============")
  print("Training the model with " + ratings.count() + " ratings... ")
  val model = ALS.train(ratings, 20, 15, 0.1) // rank = 20, iterations = 15, lambda = 0.1
  println("Training done!")
  println("========== Recommendation ===============")
  recommend(model, movieTitle)
  println("Done")
}
8. Complete Recommend.scala code
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
object Recommend {
  def main(args: Array[String]) {
    // suppress verbose log output
    SetLogger
    println("========== Data preparation ===============")
    val (ratings, movieTitle) = PrepareData()
    println("========== Training ===============")
    print("Training the model with " + ratings.count() + " ratings... ")
    val model = ALS.train(ratings, 20, 15, 0.1) // rank = 20, iterations = 15, lambda = 0.1
    println("Training done!")
    println("========== Recommendation ===============")
    recommend(model, movieTitle)
    println("Done")
  }
  def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
    var choose = ""
    while (choose != "3") { // choosing 3 (quit) ends the program
      print("Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? ")
      choose = readLine() // read user input; on Scala 2.11+ use scala.io.StdIn.readLine()
      if (choose == "1") { // 1: recommend movies for a user
        print("Enter user id? ")
        val inputUserID = readLine() // read the user ID
        RecommendMovies(model, movieTitle, inputUserID.toInt) // recommend movies for this user
      } else if (choose == "2") { // 2: recommend interested users for a movie
        print("Enter movie id? ")
        val inputMovieID = readLine() // read the movie ID
        RecommendUsers(model, movieTitle, inputMovieID.toInt) // recommend users for this movie
      }
    }
  }
  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF)
  }
  def PrepareData(): (RDD[Rating], Map[Int, String]) = {
    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // During iterative computation Spark's RDD lineage grows rapidly, and with it the stack
    // space required, until the stack overflows; explicitly specifying a checkpoint
    // directory solves the problem.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    //val DataDir = "data"
    //val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
    val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")
    //----------------------2. Create the movie-ID-to-title lookup table-------------
    print("Reading movie data...")
    //val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    return (ratingsRDD, movieTitle)
  }
  def RecommendMovies(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputUserID: Int) = {
    val RecommendMovie = model.recommendProducts(inputUserID, 10)
    var i = 1
    println("Recommended movies for user id " + inputUserID + ":")
    RecommendMovie.foreach { r =>
      println(i.toString() + ". " + movieTitle(r.product) + " rating: " + r.rating.toString())
      i += 1
    }
  }
  def RecommendUsers(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputMovieID: Int) = {
    val RecommendUser = model.recommendUsers(inputMovieID, 10)
    var i = 1
    println("Recommended users for movie id " + inputMovieID + " (" + movieTitle(inputMovieID) + "):")
    RecommendUser.foreach { r =>
      println(i.toString + ". user id: " + r.user + " rating: " + r.rating)
      i = i + 1
    }
  }
}
9. Run Recommend.scala
10. The run-time interface
11. Recommending movies for a given user
12. Recommending a given movie to interested users
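A typical session looks like this (the prompts come from the print statements above; the actual titles and scores depend on the trained model, so they are shown as placeholders):

Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? 1
Enter user id? 100
Recommended movies for user id 100:
1. <movie title> rating: <predicted rating>
... (10 entries)
Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? 3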
Note:
Without sc.setCheckpointDir("checkpoint") the program throws a StackOverflowError.
During iterative computation Spark's RDD lineage keeps growing, the stack space required grows with it, and eventually the stack overflows.
The fix is as follows:
Add sc.setCheckpointDir(path) to the code to explicitly specify a checkpoint directory, and the problem is solved.
Reference: https://blog.csdn.net/asdfghjkl1993/article/details/78626439
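A minimal sketch of the fix (setCheckpointInterval exists on MLlib's ALS builder from Spark 1.4 on; if your version lacks it, setCheckpointDir alone is the essential part):

sc.setCheckpointDir("checkpoint") // lineage is truncated at each checkpoint
val model = new ALS()
  .setRank(20)
  .setIterations(15)
  .setLambda(0.1)
  .setCheckpointInterval(2) // checkpoint every 2 iterations
  .run(ratings)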
13. Create AlsEvaluation.scala to tune the recommendation engine's parameters
It is divided into three phases:
- the data preparation phase
- the training and evaluation phase (hyperparameters are selected on a validation set)
- the test phase (the selected model gets one final, unbiased RMSE measurement on held-out test data)
14. Create PrepareData() for data preparation
def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {
  val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
  //----------------------1. Create the user rating data-------------
  print("Reading user rating data...")
  val DataDir = "data"
  val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
  val rawRatings = rawUserData.map(_.split("\t").take(3))
  val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
  println("Total: " + ratingsRDD.count.toString() + " ratings")
  //----------------------2. Create the movie-ID-to-title lookup table-------------
  print("Reading movie data...")
  val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
  val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
    .map(array => (array(0).toInt, array(1))).collect().toMap
  //----------------------3. Show record counts-------------
  val numRatings = ratingsRDD.count()
  val numUsers = ratingsRDD.map(_.user).distinct().count()
  val numMovies = ratingsRDD.map(_.product).distinct().count()
  println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
  //----------------------4. Randomly split the data into 3 parts (80%/10%/10%) and return them-------------
  println("Splitting the data:")
  val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))
  println(" trainData: " + trainData.count() + " validationData: " + validationData.count() + " testData: " + testData.count())
  return (trainData, validationData, testData)
}
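Note that randomSplit draws a fresh random split on every run, so the three counts printed above will vary slightly. If you want a reproducible split, Spark's RDD API also accepts a seed:

val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1), seed = 42L) // 42 is an arbitrary example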
15. Run training and evaluation: evaluate rank (the number of latent factors), numIterations (the number of ALS iterations) and lambda (the regularization coefficient) one at a time, then cross-evaluate all three.
def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
  println("----- Evaluating the rank parameter ---------")
  evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
  println("----- Evaluating numIterations ---------")
  evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
  println("----- Evaluating lambda ---------")
  evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
  println("----- Cross-evaluating all parameters to find the best combination ---------")
  val bestModel = evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
  return (bestModel)
}
def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
  evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
{
  var dataBarChart = new DefaultCategoryDataset()
  var dataLineChart = new DefaultCategoryDataset()
  for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {
    val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
    // use the value of the parameter under evaluation as the x-axis label
    val parameterData =
      evaluateParameter match {
        case "rank" => rank;
        case "numIterations" => numIterations;
        case "lambda" => lambda
      }
    dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
    dataLineChart.addValue(time, "Time", parameterData.toString())
  }
  Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
}
16. Chart.plotBarLineChart: draw the bar chart and the line chart (invoked by evaluateParameter above)
import org.jfree.chart._
import org.jfree.data.xy._
import org.jfree.data.category.DefaultCategoryDataset
import org.jfree.chart.axis.NumberAxis
import org.jfree.chart.axis._
import java.awt.Color
import org.jfree.chart.renderer.category.LineAndShapeRenderer;
import org.jfree.chart.plot.DatasetRenderingOrder;
import org.jfree.chart.labels.StandardCategoryToolTipGenerator;
import java.awt.BasicStroke
object Chart {
  def plotBarLineChart(Title: String, xLabel: String, yBarLabel: String, yBarMin: Double, yBarMax: Double, yLineLabel: String, dataBarChart: DefaultCategoryDataset, dataLineChart: DefaultCategoryDataset): Unit = {
    // draw the bar chart
    val chart = ChartFactory
      .createBarChart(
        "", // bar chart title (left empty)
        xLabel, // x-axis label
        yBarLabel, // bar chart y-axis label
        dataBarChart, // bar chart data
        org.jfree.chart.plot.PlotOrientation.VERTICAL, // vertical orientation
        true, // include a legend
        true, // show tooltips
        false // no URL generator
      );
    // get the plot
    val plot = chart.getCategoryPlot();
    plot.setBackgroundPaint(new Color(0xEE, 0xEE, 0xFF));
    plot.setDomainAxisLocation(AxisLocation.BOTTOM_OR_RIGHT);
    plot.setDataset(1, dataLineChart); plot.mapDatasetToRangeAxis(1, 1)
    // bar chart y-axis
    val vn = plot.getRangeAxis(); vn.setRange(yBarMin, yBarMax); vn.setAutoTickUnitSelection(true)
    // line chart y-axis
    val axis2 = new NumberAxis(yLineLabel); plot.setRangeAxis(1, axis2);
    val renderer2 = new LineAndShapeRenderer()
    renderer2.setToolTipGenerator(new StandardCategoryToolTipGenerator());
    // draw the bars first and the line second so the line is not painted over
    plot.setRenderer(1, renderer2); plot.setDatasetRenderingOrder(DatasetRenderingOrder.FORWARD);
    // create the chart frame
    val frame = new ChartFrame(Title, chart); frame.setSize(500, 500);
    frame.pack(); frame.setVisible(true)
  }
}
17. trainModel: train the model and time the training
def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
  val startTime = new DateTime()
  val model = ALS.train(trainData, rank, iterations, lambda)
  val endTime = new DateTime()
  val Rmse = computeRMSE(model, validationData)
  val duration = new Duration(startTime, endTime)
  println(f"Training parameters: rank: $rank%3d, iterations: $iterations%2d, lambda = $lambda%.2f => RMSE = $Rmse%.2f, " + "training took " + duration.getMillis + " ms")
  (Rmse, duration.getStandardSeconds)
}
18. Computing the RMSE
RMSE (root-mean-square error) measures the average error between the recommender's predicted user preferences and the users' actual preferences. The smaller the RMSE, the smaller the error, i.e. the closer the predictions are to the true values and the higher the accuracy.
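Writing $\hat{r}_i$ for the $i$-th predicted rating, $r_i$ for the actual rating, and $n$ for the number of ratings:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\hat{r}_i - r_i\right)^2}$$

This is exactly what computeRMSE below does: it joins predictions with actual ratings on the (user, movie) key, averages the squared differences, and takes the square root.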
def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {
  val num = RatingRDD.count()
  // predict a rating for every (user, product) pair in the validation set
  val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
  // join predictions with actual ratings on the (user, product) key
  val predictedAndRatings =
    predictedRDD.map(p => ((p.user, p.product), p.rating))
      .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
      .values
  // root of the mean squared difference
  math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
}
19. evaluateAllParameter: find the best parameter combination
def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
  rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
{
  val evaluations =
    for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
      val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
      (rank, numIterations, lambda, rmse)
    }
  // sort by RMSE ascending; the first entry is the best parameter combination
  val Eval = (evaluations.sortBy(_._4))
  val BestEval = Eval(0)
  println("Best model parameters: rank: " + BestEval._1 + ", iterations: " + BestEval._2 + ", lambda: " + BestEval._3 + ", RMSE = " + BestEval._4)
  // retrain with the best parameters and return the model
  val bestModel = ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
  (bestModel)
}
We cross-evaluate rank, numIterations and lambda to find the best parameter combination; with 5 candidate values for each, evaluateAllParameter trains and scores 5 × 5 × 5 = 125 models.
20. Complete AlsEvaluation code
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
import org.joda.time.format._
import org.joda.time._
import org.joda.time.Duration
import org.jfree.data.category.DefaultCategoryDataset
import org.apache.spark.mllib.regression.LabeledPoint
object AlsEvaluation {
  def main(args: Array[String]) {
    SetLogger
    println("========== Data preparation ===============")
    val (trainData, validationData, testData) = PrepareData()
    trainData.persist(); validationData.persist(); testData.persist()
    println("========== Training and validation ===============")
    val bestModel = trainValidation(trainData, validationData)
    println("========== Testing ===============")
    val testRmse = computeRMSE(bestModel, testData)
    println("Testing bestModel on testData, RMSE = " + testRmse)
    trainData.unpersist(); validationData.unpersist(); testData.unpersist()
  }
  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("----- Evaluating the rank parameter ---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("----- Evaluating numIterations ---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("----- Evaluating lambda ---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("----- Cross-evaluating all parameters to find the best combination ---------")
    val bestModel = evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
    return (bestModel)
  }
  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
    evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
  {
    var dataBarChart = new DefaultCategoryDataset()
    var dataLineChart = new DefaultCategoryDataset()
    for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {
      val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
      // use the value of the parameter under evaluation as the x-axis label
      val parameterData =
        evaluateParameter match {
          case "rank" => rank;
          case "numIterations" => numIterations;
          case "lambda" => lambda
        }
      dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
      dataLineChart.addValue(time, "Time", parameterData.toString())
    }
    Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
  }
  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
    rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
  {
    val evaluations =
      for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
        (rank, numIterations, lambda, rmse)
      }
    // sort by RMSE ascending; the first entry is the best parameter combination
    val Eval = (evaluations.sortBy(_._4))
    val BestEval = Eval(0)
    println("Best model parameters: rank: " + BestEval._1 + ", iterations: " + BestEval._2 + ", lambda: " + BestEval._3 + ", RMSE = " + BestEval._4)
    // retrain with the best parameters and return the model
    val bestModel = ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
    (bestModel)
  }
  def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {
    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // During iterative computation Spark's RDD lineage grows rapidly, and with it the stack
    // space required, until the stack overflows; explicitly specifying a checkpoint
    // directory solves the problem.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")
    //----------------------2. Create the movie-ID-to-title lookup table-------------
    print("Reading movie data...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    //----------------------4. Randomly split the data into 3 parts (80%/10%/10%) and return them-------------
    println("Splitting the data:")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))
    println(" trainData: " + trainData.count() + " validationData: " + validationData.count() + " testData: " + testData.count())
    return (trainData, validationData, testData)
  }
  def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    println(f"Training parameters: rank: $rank%3d, iterations: $iterations%2d, lambda = $lambda%.2f => RMSE = $Rmse%.2f, " + "training took " + duration.getMillis + " ms")
    (Rmse, duration.getStandardSeconds)
  }
  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {
    val num = RatingRDD.count()
    val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
    val predictedAndRatings =
      predictedRDD.map(p => ((p.user, p.product), p.rating))
        .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
        .values
    math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
  }
  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF)
  }
}
21. Run AlsEvaluation
In each chart the bars show the RMSE and the line shows the training time.
Result chart for evaluating rank
Result chart for evaluating numIterations
Result chart for evaluating lambda
After training, all parameters are cross-evaluated to find the best combination
22. Modify Recommend.scala to use the best parameter combination
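For example (the numbers here are hypothetical; substitute whatever trainValidation reports as best on your own run), if the best combination were rank = 15, iterations = 20 and lambda = 0.1, the training line in Recommend.scala's main would become:

val model = ALS.train(ratings, 15, 20, 0.1) // best rank, iterations, lambda found by AlsEvaluation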