// Collaborative filtering, simply put, recommends items of interest to a user by leveraging the preferences of a group of like-minded users with shared experience: individuals respond to information through a cooperative mechanism, those responses are recorded to achieve filtering, which in turn helps others make selections.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
/** One rating event: user `userId` scored movie `movieId` with `rating` at `timestamp`. */
// NOTE: must be declared outside the singleton object, otherwise encoder derivation fails at runtime.
case class Rating2(userId: Int, movieId: Int, rating: Float, timestamp: Long)
object ALSDemo {
  def main(args: Array[String]): Unit = {
    // Local Spark session; spark.testing.memory works around low-memory environments.
    val spark = SparkSession.builder().config("spark.testing.memory", "2147480000")
      .master("local[*]")
      .appName("sda")
      .getOrCreate()
    import spark.implicits._

    // The raw file is text, so each line must be parsed into typed fields.
    /** Parse one tab-separated line ("user\titem\trating\ttimestamp") into a Rating2. */
    def parseRating(str: String): Rating2 = {
      val fields = str.split("\t")
      assert(fields.size == 4)
      Rating2(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
    }

    // Read and cache the MovieLens 100k ratings file.
    val ratings = spark.read.textFile("D:\\hadoop\\spark\\ml-100k\\u.data").map(parseRating)
      .toDF("userId", "movieId", "rating", "timestamp").cache()
    ratings.show(5)
    ratings.describe("userId", "movieId", "rating").show()

    // 80/20 train/test split for the first (fixed-parameter) model.
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), seed = 1234)

    // ALS parameters:
    //   numBlocks     - number of blocks used to parallelize computation
    //   rank          - number of latent factors in the model
    //   maxIter       - number of iterations (keep small on weak machines or it may fail)
    //   regParam      - regularization parameter
    //   implicitPrefs - explicit (false, the default) vs implicit feedback
    //   alpha         - baseline confidence for implicit-preference observations
    //   nonnegative   - apply a non-negativity constraint to the least-squares solve
    val als = new ALS().setMaxIter(5)
      .setRank(10)
      .setRegParam(0.01)
      .setNonnegative(true)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    // Assemble the training stage into a pipeline and fit it.
    val pipeline = new Pipeline().setStages(Array(als))
    val model = pipeline.fit(training)

    // Predict on the held-out set and inspect predictions next to the true ratings.
    val predictions = model.transform(test)
    predictions.show(5)

    // ALS can produce NaN predictions for users/items unseen during training
    // (addressed by coldStartStrategy from Spark 2.2 on). Count them, then drop
    // them before computing the evaluation metric - NaN means "no recommendation",
    // which is legitimate, but it would poison the RMSE.
    predictions.filter(predictions("prediction").isNaN).select("userId", "movieId", "rating", "prediction").count()
    val predictions1 = predictions.na.drop()
    val evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("rating").setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions1)
    println(rmse)

    // Model selection: split the ratings into train / validation / test (60/20/20).
    val splits = ratings.randomSplit(Array(0.6, 0.2, 0.2), 12)
    val training2 = splits(0).cache()
    val validation2 = splits(1).toDF().cache()
    val test2 = splits(2).toDF().cache()
    // FIX: previously counted `training` (the earlier 80% split); count the 60% split.
    val numTraining = training2.count()
    val numValidation = validation2.count()
    val numTest = test2.count()

    // Hyper-parameter grid evaluated against the validation set.
    val ranks = List(10, 20)
    val lambdas = List(0.01, 0.1)
    val numIters = List(2, 4)
    var bestModel: Option[ALSModel] = None
    var bestValidationRmse = Double.MaxValue
    var bestRank = 0
    var bestLambda = 1.0
    var bestNumIter = 1

    /**
     * RMSE of `model` on `data`, dividing the summed squared error by `n`.
     * NaN predictions (cold-start users/items) are dropped before summing.
     */
    def computeRmse(model: ALSModel, data: DataFrame, n: Long): Double = {
      // FIX: the original joined the NaN-dropped predictions against the
      // un-dropped ones by positional column indices (x(4) assumed to be
      // "prediction"), which is redundant and breaks if column order changes.
      // Select the columns by name after a single na.drop() instead.
      val scored = model.transform(data).na.drop()
      val sumSquaredError = scored.rdd.map { row =>
        val err = row.getAs[Float]("rating").toDouble - row.getAs[Float]("prediction").toDouble
        err * err
      }.reduce(_ + _)
      math.sqrt(sumSquaredError / n)
    }

    for (rank <- ranks; lambda <- lambdas; numIter <- numIters) {
      val als = new ALS()
        .setMaxIter(numIter)
        .setRank(rank) // FIX: rank was never set, so the grid over `ranks` had no effect
        .setRegParam(lambda) // FIX: was setRegParam(rank), regularizing with 10/20 instead of lambda
        .setNonnegative(true)
        .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")
      // FIX: train on the 60% split; the original fit on `training` (the 80% split),
      // which overlaps the validation set and invalidates the comparison.
      val model = als.fit(training2)
      val validationRmse = computeRmse(model, validation2, numValidation)
      println("RMSE(validation) = " + validationRmse + " for the model trained with rank = " + rank + ",lambda = " + lambda
        + ",and numIter = " + numIter + ".")
      if (validationRmse < bestValidationRmse) {
        bestModel = Some(model)
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lambda
        bestNumIter = numIter
      }
    }
    // Report the winning configuration (previously computed but never printed).
    println("Best model: rank = " + bestRank + ", lambda = " + bestLambda +
      ", numIter = " + bestNumIter + ", validation RMSE = " + bestValidationRmse)
  }
}