在工程窗口内(如下图所示),在“recommend”文件夹上单击鼠标右键,在弹出的菜单中点击“New”,再选择“Scala Class”。
Q:没有“Scala Class”选项
W:点击“File”->“Project Structure”->“Global Libraries”,重新添加Scala SDK,
然后再次尝试新建(如果仍然没有“Scala Class”选项,请尝试重启IDEA)
然后,在弹出的对话框中(如下图所示),在“Name”中输入“MovieLensALS”,在“Kind”后面的下拉列表中选择“Object”。
然后,在新建的代码文件MovieLensALS.scala中输入如下代码:
package recommend
import java.io.File
import scala.io.Source
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
/**
 * Movie recommender built on Spark MLlib's ALS (alternating least squares).
 *
 * The program loads the MovieLens `ratings.dat` / `movies.dat` files plus a personal
 * ratings file, searches a small hyper-parameter grid for the model with the lowest
 * validation RMSE, compares it with a mean-rating baseline on the test split and
 * finally prints the top recommendations for a fixed set of user ids.
 */
object MovieLensALS {

  /** Number of recommendations printed for each user. */
  private val TopN = 5

  /** User ids for which recommendations are printed. */
  private val RecommendedUserIds = 1 to 4

  /**
   * Program entry point.
   *
   * @param args exactly five arguments: movieLensHomeDir, personalRatingsFile,
   *             bestRank, bestLambda, bestNumiter
   */
  def main(args: Array[String]): Unit = {
    // Keep noisy framework logging off the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    if (args.length != 5) {
      println("Usage: /usr/local/spark/bin/spark-submit --class recommend.MovieLensALS " +
        "Spark_Recommend.jar movieLensHomeDir personalRatingsFile bestRank bestLambda bestNumiter")
      sys.exit(1)
    }

    // Set up the execution environment.
    val conf = new SparkConf().setAppName("MovieLensALS").setMaster("local[1]")
    val sc = new SparkContext(conf)
    // Stop the context even when loading, training or prediction throws.
    try {
      run(sc, args)
    } finally {
      sc.stop()
    }
  }

  /** Runs the whole load / train / evaluate / recommend pipeline on an existing context. */
  private def run(sc: SparkContext, args: Array[String]): Unit = {
    // Hyper-parameters passed on the command line. They are parsed (and thereby validated)
    // here; the grid search below does not use them. To train a single model from them
    // instead of running the grid search, use:
    //   val model = ALS.train(training, cliRank, cliNumIter, cliLambda)
    val cliRank = args(2).toInt
    val cliLambda = args(3).toDouble
    val cliNumIter = args(4).toInt

    // Second argument: the personal ratings file produced by the rating tool.
    val myRatings = loadRatings(args(1))
    val myRatingsRDD = sc.parallelize(myRatings, 1)

    // First argument: directory holding the sample data.
    val movieLensHomeDir = args(0)

    // ratings.dat line format: userId::movieId::rating::timestamp.
    // Each rating is keyed by (timestamp % 10), giving RDD[(Long, Rating)]; the key is
    // only used to split the data into training / validation / test sets below.
    val ratings = sc.textFile(new File(movieLensHomeDir, "ratings.dat").toString).map { line =>
      val fields = line.split("::")
      (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
    }

    // movies.dat line format: movieId::title::genres. Build the lookup movieId -> title.
    val movies = sc.textFile(new File(movieLensHomeDir, "movies.dat").toString).map { line =>
      val fields = line.split("::")
      fields(0).toInt -> fields(1)
    }.collect().toMap

    // Split by key: training (keys 0-5, plus the personal ratings), validation (keys 6-7)
    // and test (keys 8-9). The splits are reused many times, so they are cached.
    val numPartitions = 4
    val training = ratings
      .filter { case (bucket, _) => bucket < 6 }
      .values
      .union(myRatingsRDD)
      .repartition(numPartitions)
      .cache()
    val validation = ratings
      .filter { case (bucket, _) => bucket >= 6 && bucket < 8 }
      .values
      .repartition(numPartitions)
      .cache()
    val test = ratings
      .filter { case (bucket, _) => bucket >= 8 }
      .values
      .cache()

    val numValidation = validation.count()
    val numTest = test.count()

    // Hyper-parameter grid: number of latent factors, ALS regularisation, iterations.
    val ranks = List(8, 12)
    val lambdas = List(0.1, 10.0)
    val numIters = List(10, 20)
    val grid = for (rank <- ranks; lambda <- lambdas; numIter <- numIters)
      yield (rank, lambda, numIter)

    // Train one model per grid point and keep only the one with the lowest validation
    // RMSE (the first one wins on ties; a NaN RMSE is never selected).
    val best = grid.foldLeft(Option.empty[(MatrixFactorizationModel, Double)]) {
      case (bestSoFar, (rank, lambda, numIter)) =>
        val model = ALS.train(training, rank, numIter, lambda)
        val validationRmse = computeRmse(model, validation, numValidation)
        if (validationRmse < bestSoFar.fold(Double.MaxValue)(_._2)) Some((model, validationRmse))
        else bestSoFar
    }
    val bestModel = best.map(_._1).getOrElse(
      sys.error("No model produced a usable validation RMSE; cannot pick a best model."))

    // Evaluate the best model on the test set and compare it with a naive baseline that
    // always predicts the mean rating of the training and validation data.
    val testRmse = computeRmse(bestModel, test, numTest)
    val meanRating = training.union(validation).map(_.rating).mean()
    val baselineRmse = math.sqrt(test.map { r =>
      val diff = meanRating - r.rating
      diff * diff
    }.mean())
    val improvement = (baselineRmse - testRmse) / baselineRmse * 100
    println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")

    // Candidate movies: everything not already rated in the personal ratings file.
    // NOTE(review): the same candidate set is used for every user id below, so movies a
    // given user already rated in ratings.dat are not excluded for that user - confirm
    // whether that is intended.
    val myRatedMovieIds = myRatings.map(_.product).toSet
    val candidates = sc.parallelize(movies.keys.filter(id => !myRatedMovieIds.contains(id)).toSeq)

    println("Movies recommended for you(用户ID:推荐电影ID:推荐分数:推荐电影名称):")
    for (userId <- RecommendedUserIds) {
      topRecommendations(bestModel, candidates, userId, TopN).foreach { r =>
        println(s"${r.user}:${r.product}:${r.rating}:${movies(r.product)}")
      }
    }
  }

  /** Predicts ratings of `userId` for all `candidates` and returns the `howMany` highest. */
  private def topRecommendations(
      model: MatrixFactorizationModel,
      candidates: RDD[Int],
      userId: Int,
      howMany: Int): Array[Rating] =
    model
      .predict(candidates.map(movieId => (userId, movieId)))
      .collect()
      .sortBy(-_.rating)
      .take(howMany)

  /**
   * Root-mean-square error between the model's predictions and the actual ratings.
   *
   * @param model trained model used to predict
   * @param data  ratings to evaluate against
   * @param n     divisor for the squared-error sum (callers pass the size of `data`)
   * @return the square root of (sum of squared errors / n)
   */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], n: Long): Double = {
    val predictions = model.predict(data.map(r => (r.user, r.product)))
    // Pair each prediction with the actual rating for the same (user, product).
    val predictionsAndRatings = predictions
      .map(p => ((p.user, p.product), p.rating))
      .join(data.map(r => ((r.user, r.product), r.rating)))
      .values
    val squaredErrorSum = predictionsAndRatings.map { case (predicted, actual) =>
      val err = predicted - actual
      err * err
    }.reduce(_ + _)
    math.sqrt(squaredErrorSum / n)
  }

  /**
   * Loads the personal ratings file (one `userId::movieId::rating` record per line;
   * any further fields are ignored) and keeps only ratings greater than zero.
   *
   * The file is read eagerly and closed before returning.
   *
   * @param path path of the ratings file on the local file system
   * @return the non-empty list of positive ratings
   * @throws RuntimeException if the file contains no rating greater than zero
   */
  def loadRatings(path: String): Seq[Rating] = {
    val source = Source.fromFile(path)
    val ratings =
      try {
        source.getLines().map { line =>
          val fields = line.split("::")
          Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
        }.filter(_.rating > 0.0).toList // materialise before the file is closed
      } finally {
        source.close()
      }
    if (ratings.isEmpty) sys.error("No ratings provided.") else ratings
  }
}
接下来,修改工程中的pom.xml文件,将其内容替换为如下内容:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Spark_Recommend project (the MovieLensALS recommender). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>Spark_Recommend</groupId>
<artifactId>Spark_Recommend</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Single Spark version shared by every Spark artifact below. -->
<spark.version>2.1.0</spark.version>
</properties>
<!-- The _2.11 suffix of the Spark artifacts is the Scala binary version; it has to agree
     with the scalaVersion (2.11.8) configured for scala-maven-plugin further down. -->
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib_2.11 -->
<!-- MLlib: provides the org.apache.spark.mllib.recommendation classes (ALS, Rating, MatrixFactorizationModel). -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- NOTE(review): spark-hive is not imported by the MovieLensALS code in this tutorial;
     presumably needed by other parts of the project - confirm before removing. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 -->
<!-- Spark core: SparkConf, SparkContext and the RDD API. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- NOTE(review): the MySQL JDBC driver is not referenced by the MovieLensALS code in
     this tutorial; presumably used elsewhere - confirm. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.39</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-common -->
<!-- NOTE(review): hadoop-yarn-common is not referenced by the MovieLensALS code in this
     tutorial either - confirm it is required. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
<version>2.7.4</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiles the Scala sources; a plain Maven build only compiles Java. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<!-- Main sources: register the Scala source directory and compile it in the compile phase. -->
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<!-- Test sources: the same for the test-compile phase. -->
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<!-- Scala compiler version; must stay on the 2.11 line to match the _2.11 artifacts above. -->
<scalaVersion>2.11.8</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
</project>
然后,如下图所示,在工程目录窗口内,在pom.xml文件上单击鼠标右键, 在弹出的菜单中选择“Maven”,再在弹出的下一级菜单中选择“Download Sources and Documentation”,把对应的依赖包下载到工程内,首次执行时,这个下载过程需要花费一定的时间,后面再次执行时,速度会快很多。
Q:使用Maven的默认源下载太慢
W:切换成国内源
切换步骤参考教程:https://www.freesion.com/article/9163446230/
在IDEA中打开settings.xml时可能只有只读权限,此时可改为在命令行中用编辑器修改settings.xml文件
Q:虚拟机磁盘容量过小
W:
(推荐)法一:使用命令行安装gparted,然后用它调整分区大小,参考教程“gparted分区”
法二:磁盘扩容-参考教程 http://www.nxpic.org/module/forum/thread-614446-1-1.html
Q:放大虚拟机窗口
W:在虚拟机软件的菜单栏中依次点击“查看”->“自动调整大小”->“自适应客户机”