// Invocation from within Spark
/**
 * Computes the root mean squared error (RMSE) of a recommendation model
 * over a test set: the square root of the mean of the squared differences
 * between predicted and actual ratings. Lower RMSE means a better model.
 *
 * @param model      a trained matrix-factorization (ALS) model
 * @param dataOfTest the test set as RDD[Rating] (user, product, rating)
 * @return the RMSE of the model's predictions against the test ratings
 */
def RmseComputer(model: MatrixFactorizationModel, dataOfTest: RDD[Rating]): Double = {
  // The model only needs (user, product) pairs to predict, so strip the
  // actual rating (and any timestamp) with a map instead of a case class.
  val predictResult = model.predict(dataOfTest.map(x => (x.user, x.product)))
  // Key both predictions and actuals by (user, product), join them, and keep
  // only the paired values: RDD[(prediction, actualValue)].
  val predJoinTest = predictResult
    .map(x => ((x.user, x.product), x.rating))
    .join(dataOfTest.map(x => ((x.user, x.product), x.rating)))
    .values
  // RegressionMetrics expects an RDD of (prediction, observation) pairs.
  val evaluator = new RegressionMetrics(predJoinTest)
  // BUG FIX: the original returned meanAbsoluteError (MAE), contradicting the
  // function's name and documentation; RMSE is what the caller compares.
  evaluator.rootMeanSquaredError
}
// Grid search over ALS hyper-parameters to find the best model.
// Candidate iteration counts.
val numIters = List(10, 20)
// Candidate numbers of latent factors.
val numRanks = List(8, 12)
// Candidate regularization (penalty) values.
val numLambdas = List(0.1, 10.0)
// Mutable trackers for the best configuration seen so far; they are vars
// because they are updated from inside the search loop below.
var bestModel: Option[MatrixFactorizationModel] = None
var bestRanks = -1
var bestIters = 0
var bestLambdas = -1.0
var bestRmse = Double.MaxValue
// 2 * 2 * 2 = 8 combinations, each training a full ALS model with its own
// iteration count — this is the expensive part of the script.
for {
  rank   <- numRanks
  iter   <- numIters
  lambda <- numLambdas
} {
  // ALS.train(trainingSet, latentFactors, iterations, regularization)
  val candidate = ALS.train(trainingSetOfRatingsData, rank, iter, lambda)
  val candidateRmse = RmseComputer(candidate, testSetOfRatingData)
  // Keep this candidate only if it strictly improves on the best RMSE so far
  // (akin to keeping the running minimum during gradient-descent-style tuning).
  if (candidateRmse < bestRmse) {
    // bestModel is an Option, so wrap the model in Some.
    bestModel = Some(candidate)
    bestRmse = candidateRmse
    bestIters = iter
    bestLambdas = lambda
    bestRanks = rank
  }
}