Spark协同过滤
原理
参考
spark协同过滤
spark电影推荐算法with mllib
spark协同过滤 berkeley
数据集
MovieLens 数据集
MovieLens 数据集 1（ml-10m）：约 10,000 部电影、72,000 名用户的评分
MovieLens 数据集 2（ml-1m）：6,040 名用户对约 4,000 部电影的评分
步骤
1. 上传数据至hdfs
使用小数据集 ml-1m
行数 | 文件名 |
---|---|
3883 | ./movies.dat |
1000209 | ./ratings.dat |
159 | ./README |
6040 | ./users.dat |
文件说明
文件名 | 格式 | 示例 |
---|---|---|
movies.dat | MovieID::Title::Genres | 1::Toy Story (1995)::Animation|Children's|Comedy |
ratings.dat | UserID::MovieID::Rating::Timestamp | 1::1193::5::978300760 |
users.dat | UserID::Gender::Age::Occupation::Zip-code | 1::F::1::10::48067 |
2. 编写scala,打包成jar
MovieLensALS.scala
import java.util.Random
import org.apache.log4j.Logger
import org.apache.log4j.Level
import scala.io.Source
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ALS, Rating, MatrixFactorizationModel}
/**
 * Skeleton for the MovieLens ALS collaborative-filtering exercise.
 *
 * Reads the MovieLens `ratings.dat` and `movies.dat` files from HDFS on a
 * spark-ec2 cluster. Ratings are keyed by `timestamp % 10` so they can later
 * be split into deterministic training/validation/test folds. Model training
 * plus the `computeRmse` and `elicitateRatings` helpers are left as exercises.
 */
object MovieLensALS {

  def main(args: Array[String]) {
    // Quiet Spark/Jetty logging so interactive console output stays readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    if (args.length != 1) {
      println("Usage: sbt/sbt package \"run movieLensHomeDir\"")
      // Fix: bare `exit` (Predef.exit) is deprecated and removed in Scala 2.11;
      // `sys.exit` is the supported replacement.
      sys.exit(1)
    }

    // Set up environment — paths assume the standard spark-ec2 cluster layout.
    val jarFile = "target/scala-2.10/movielens-als_2.10-0.0.jar"
    val sparkHome = "/root/spark"
    val master = Source.fromFile("/root/spark-ec2/cluster-url").mkString.trim
    val masterHostname = Source.fromFile("/root/spark-ec2/masters").mkString.trim
    val conf = new SparkConf()
      .setMaster(master)
      .setSparkHome(sparkHome)
      .setAppName("MovieLensALS")
      .set("spark.executor.memory", "8g")
      .setJars(Seq(jarFile))
    val sc = new SparkContext(conf)

    // Load ratings and movie titles. args(0) is an absolute HDFS path such as
    // "/movielens/medium"; the namenode listens on port 9000 on the master.
    val movieLensHomeDir = "hdfs://" + masterHostname + ":9000" + args(0)

    // ratings.dat format: UserID::MovieID::Rating::Timestamp.
    // Keyed by (timestamp % 10) so the data can be split into 10 folds later.
    val ratings = sc.textFile(movieLensHomeDir + "/ratings.dat").map { line =>
      val fields = line.split("::")
      // format: (timestamp % 10, Rating(userId, movieId, rating))
      (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
    }

    // movies.dat format: MovieID::Title::Genres. Collected to a local Map —
    // safe because ml-1m has only ~3.9k movies.
    val movies = sc.textFile(movieLensHomeDir + "/movies.dat").map { line =>
      val fields = line.split("::")
      // format: (movieId, movieName)
      (fields(0).toInt, fields(1))
    }.collect.toMap

    // your code here

    // Clean up: release the SparkContext and its cluster resources.
    sc.stop()
  }

  /** Compute RMSE (Root Mean Squared Error) of `model` over `data`, where `n`
   *  is the number of ratings in `data`. Left as an exercise. */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], n: Long) = {
    // ...
  }

  /** Elicit ratings for a sample of `(movieId, title)` pairs from the
   *  command line. Left as an exercise. */
  def elicitateRatings(movies: Seq[(Int, String)]) = {
    // ...
  }
}