离线图书推荐：使用 Spark MLlib 的 ALS 协同过滤算法

最新推荐文章于 2024-08-07 23:30:48 发布

呆大头

最新推荐文章于 2024-08-07 23:30:48 发布

阅读量565

点赞数

文章标签： spark

本文链接：https://blog.csdn.net/qq_43561757/article/details/105957800

版权

图书推荐：使用 Spark MLlib 的 ALS 协同过滤算法，基于 Book-Crossing 数据集，使用 Scala 语言，在虚拟机中的 IDEA 平台上运行；代码参照《Hadoop 大数据实战权威指南》第十一章，只能说算是能运行出结果了吧。
package com.csu
import java.util.Random
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd._

/**
 * Offline book recommender built on Spark MLlib's ALS collaborative filtering.
 *
 * Reads `bookrating.csv` (`user;isbn;rating`) and `book.csv` (`isbn;title;...`) from the
 * directory given as the single command-line argument, asks the user to rate a few popular
 * books, grid-searches ALS hyper-parameters on a validation split, reports the test RMSE
 * against a mean-rating baseline, and prints the top 10 recommendations.
 */
object bookALS {

  /**
   * ISBNs are hashed into `[0, IdModulus)` to obtain the Int product ids ALS needs.
   *
   * NOTE(review): the published source showed `%(108)`; the PySpark snippet this was ported
   * from used `% (10 ** 8)`, so the value below is assumed to be the intent - confirm.
   */
  private val IdModulus = 100000000

  /** Number of buckets used to split ratings into train / validation / test. */
  private val NumSplitBuckets = 10

  /** User id under which the interactively elicited ratings are stored. */
  private val InteractiveUserId = 0

  /** One evaluated point of the hyper-parameter grid. */
  private final case class Candidate(
      model: MatrixFactorizationModel,
      rmse: Double,
      rank: Int,
      lambda: Double,
      numIter: Int)

  /**
   * Entry point.
   *
   * @param args exactly one element: the directory containing `bookrating.csv` and `book.csv`
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println("Please input the book data home directory, e.g: /tmp/data/")
      System.exit(1)
    }
    val bookHomeDir = args(0)
    val ratingsPath = bookHomeDir + "/bookrating.csv"
    val booksPath = bookHomeDir + "/book.csv"
    if (!new java.io.File(ratingsPath).exists) {
      println("File bookrating.csv does not exist under directory: " + bookHomeDir)
      System.exit(1)
    }
    if (!new java.io.File(booksPath).exists) {
      println("File book.csv does not exist under directory: " + bookHomeDir)
      System.exit(1)
    }

    // set up environment
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("bookALS")
      .set("spark.executor.memory", "1024m")
    val sc = new SparkContext(conf)
    try {
      run(sc, ratingsPath, booksPath)
    } finally {
      // clean up even when loading, training or prompting fails
      sc.stop()
    }
  }

  /** Loads the data, trains and evaluates the models, and prints the recommendations. */
  private def run(sc: SparkContext, ratingsPath: String, booksPath: String): Unit = {
    // load ratings and book titles; rows that cannot be parsed (e.g. a header) are skipped
    val ratings = sc.textFile(ratingsPath).flatMap(line => parseRating(line).iterator).cache()
    val books = sc.textFile(booksPath).flatMap(line => parseBook(line).iterator).collect().toMap

    val numRatings = ratings.count()
    val numUsers = ratings.map(_._2.user).distinct().count()
    val numBooks = ratings.map(_._2.product).distinct().count()
    println("Got " + numRatings + " ratings from " + numUsers + " users on " + numBooks + " books.")

    // sample a subset of the most rated books for rating elicitation
    val mostRatedBookIds = ratings.map(_._2.product) // extract book ids
      .countByValue() // count ratings per book
      .toSeq // convert map to Seq
      .filter { case (id, _) => books.contains(id) } // only books whose title we can show
      .sortBy(-_._2) // sort by rating count, descending
      .take(20) // take the 20 most rated
      .map(_._1) // get their ids
    val random = new Random(0)
    val selectedBooks = mostRatedBookIds
      .filter(_ => random.nextDouble() < 0.2)
      .flatMap(id => books.get(id).map(title => (id, title)))

    // elicitate ratings
    val myRatings = elicitateRatings(selectedBooks)
    val myRatingsRDD = sc.parallelize(myRatings, 1)

    // split ratings into train (60%), validation (20%), and test (20%) based on the
    // hash bucket of each rating, add myRatings to train, and cache them
    val numPartitions = 20
    val training = ratings.filter(_._1 < 6)
      .values
      .union(myRatingsRDD)
      .repartition(numPartitions)
      .persist()
    val validation = ratings.filter(x => x._1 >= 6 && x._1 < 8)
      .values
      .repartition(numPartitions)
      .persist()
    val test = ratings.filter(_._1 >= 8).values.persist()
    val numTraining = training.count()
    val numValidation = validation.count()
    val numTest = test.count()
    println("Training: " + numTraining + ", validation: " + numValidation + ", test: " + numTest)

    // train models and evaluate them on the validation set
    val ranks = List(8, 10)
    val lambdas = List(0.1, 10.0)
    val numIters = List(10, 15)
    val grid = for (rank <- ranks; lambda <- lambdas; numIter <- numIters)
      yield (rank, lambda, numIter)
    // Keep only the best model seen so far, so at most two models are alive at once.
    val best = grid.foldLeft(Option.empty[Candidate]) {
      case (bestSoFar, (rank, lambda, numIter)) =>
        val model = ALS.train(training, rank, numIter, lambda)
        val validationRmse = computeRmse(model, validation, numValidation)
        println("RMSE (validation) = " + validationRmse + " for the model trained with rank = "
          + rank + ", lambda = " + lambda + ", and numIter = " + numIter + ".")
        // A NaN RMSE never compares as smaller, so it must not become the first "best" either.
        if (!validationRmse.isNaN && bestSoFar.forall(validationRmse < _.rmse)) {
          Some(Candidate(model, validationRmse, rank, lambda, numIter))
        } else {
          bestSoFar
        }
    }.getOrElse(sys.error("No model produced a valid validation RMSE."))

    // evaluate the best model on the test set
    val testRmse = computeRmse(best.model, test, numTest)
    println("The best model was trained with rank = " + best.rank + " and lambda = " + best.lambda
      + ", and numIter = " + best.numIter + ", and its RMSE on the test set is " + testRmse + ".")

    // create a naive baseline and compare it with the best model
    val meanRating = training.union(validation).map(_.rating).mean()
    val baselineRmse = math.sqrt(
      test.map(x => (meanRating - x.rating) * (meanRating - x.rating)).reduce(_ + _) / numTest)
    val improvement = (baselineRmse - testRmse) / baselineRmse * 100
    println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")

    // make personalized recommendations
    val myRatedBookIds = myRatings.map(_.product).toSet
    val candidates = sc.parallelize(books.keys.filter(id => !myRatedBookIds.contains(id)).toSeq)
    val recommendations = best.model
      .predict(candidates.map(id => (InteractiveUserId, id)))
      .collect()
      .sortBy(-_.rating)
      .take(10)
    println("Books recommended for you:")
    recommendations.zipWithIndex.foreach { case (r, idx) =>
      val title = books.getOrElse(r.product, "<unknown book " + r.product + ">")
      println("%2d".format(idx + 1) + ": " + title)
    }
  }

  /** Strips the double quotes around CSV values and splits the row on `;`. */
  private def splitFields(line: String): Array[String] =
    line.replace("\"", "").split(";")

  /**
   * Maps an ISBN to a non-negative Int id. The modulo is taken before `abs` so that a hash
   * of `Int.MinValue` (whose absolute value is still negative) cannot yield a negative id.
   */
  private def bookId(isbn: String): Int = math.abs(isbn.hashCode % IdModulus)

  /**
   * Parses one `user;isbn;rating` row into `(split bucket, Rating)`.
   *
   * The bucket is a deterministic hash of user and ISBN. The data has no timestamp column,
   * and bucketing on the rating value itself (as the original did) would put every rating
   * value into exactly one of train / validation / test.
   *
   * NOTE(review): the original had a branch guarded by `fields(2)==0` meant to rewrite a
   * rating of 0; it compared a String with an Int and never fired, so zero ratings have
   * always been passed through unchanged. That behaviour is kept here - confirm whether
   * zero ratings should instead be dropped or remapped.
   *
   * @return `None` when the row has too few fields or non-numeric user / rating values
   */
  private def parseRating(line: String): Option[(Long, Rating)] = {
    val fields = splitFields(line)
    scala.util.Try {
      val user = fields(0).toInt
      val isbn = fields(1)
      val score = fields(2).toDouble
      val bucket = math.abs((fields(0) + ";" + isbn).hashCode % NumSplitBuckets).toLong
      (bucket, Rating(user, bookId(isbn), score))
    }.toOption
  }

  /** Parses one `isbn;title;...` row into `(book id, title)`; `None` if the title is missing. */
  private def parseBook(line: String): Option[(Int, String)] = {
    val fields = splitFields(line)
    if (fields.length >= 2) Some((bookId(fields(0)), fields(1))) else None
  }

  /**
   * Compute RMSE (Root Mean Squared Error).
   *
   * NOTE(review): the squared errors are summed over the (user, product) pairs for which
   * the model returned a prediction, but divided by `n`. If the model yields no prediction
   * for some pairs in `data`, the result understates the error - confirm this is intended.
   *
   * @param model trained factorization model
   * @param data  ratings to evaluate against
   * @param n     divisor for the summed squared error (callers pass the size of `data`)
   */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], n: Long): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map(x => ((x.user, x.product), x.rating))
      .join(data.map(x => ((x.user, x.product), x.rating)))
      .values
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
  }

  /**
   * Elicitate ratings from command-line.
   *
   * Each book is asked about until an integer in 0..10 is entered; 0 means "no rating" and
   * produces no entry. End of input stops the prompting for the current book.
   *
   * @param movies `(book id, title)` pairs to ask about
   * @return the non-zero ratings entered, attributed to user 0; fails with a runtime error
   *         when no rating was provided
   */
  def elicitateRatings(movies: Seq[(Int, String)]): Seq[Rating] = {
    val prompt = "Please rate the following book (1-10 (best), or 0 if not read):"
    println(prompt)

    // Reads one answer for `title`; None means end of input was reached.
    @scala.annotation.tailrec
    def ask(title: String): Option[Int] = {
      print(title + ": ")
      Console.out.flush()
      Option(Console.in.readLine()) match {
        case None => None
        case Some(line) =>
          scala.util.Try(line.trim.toInt).toOption.filter(r => r >= 0 && r <= 10) match {
            case Some(r) => Some(r)
            case None =>
              println(prompt)
              ask(title)
          }
      }
    }

    val ratings = movies.flatMap { case (id, title) =>
      ask(title).filter(_ > 0).map(r => Rating(InteractiveUserId, id, r.toDouble)).toList
    }
    if (ratings.isEmpty) {
      sys.error("No rating provided!")
    } else {
      ratings
    }
  }
}
图书数据:

Institut für Informatik, Universität Freiburg - Book Ratings Data Sets O网页链接

呆大头

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
离线图书推荐：使用 Spark MLlib 的 ALS 协同过滤算法

图书推荐，使用sparkMLlib ALS协同过滤算法，bookcrossing数据集，Scala语言，虚拟机ieda平台，代码参照HADOOP大数据实战权威指南第十一章，只能说算是能运行出结果了吧package com.csuimport java.util.Randomimport org.apache.spark.{SparkConf, SparkContext}import or...
复制链接

扫一扫