Spark 2.x 中,当某列保存的数据是一个结构数组(array of struct)时,DataFrame API 没有提供直接、优雅的取出方式,只能先转成 RDD,再按 Row 逐行解析。
以下是用Spark构建协同过滤实现推荐引擎的一个例子(https://spark.apache.org/docs/2.4.5/ml-collaborative-filtering.html),例子中的数据只能通过如下方式取出。
object RecommendationProducts {

  /**
   * Trains an ALS collaborative-filtering model on the ratings data,
   * reports RMSE on a held-out split, and demonstrates how Spark 2.x
   * forces you to unpack an array-of-struct column (`recommendations`)
   * through the RDD API.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[2]").appName("UserRecommendation").getOrCreate()
    // Brings in the $-column interpolator used below.
    import spark.implicits._

    val ratings = FeatureExtraction.getFeatures()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    // Build the recommendation model using ALS on the training data.
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    // Evaluate the model by computing the RMSE on the test data.
    // Cold-start strategy 'drop' ensures we don't get NaN evaluation metrics
    // for users/items unseen during training.
    model.setColdStartStrategy("drop")
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    // The original computed rmse but never reported it, leaving the whole
    // evaluation step dead — print it so the metric is actually visible.
    println(s"Root-mean-square error = $rmse")

    // Generate top 10 movie recommendations for each user.
    val userRecs = model.recommendForAllUsers(10)
    // Generate top 10 user recommendations for each movie.
    val movieRecs = model.recommendForAllItems(10)

    // Recommendations for a single user (id 789); typed column comparison
    // instead of string-concatenating the predicate.
    val users = ratings.select(als.getUserCol).filter($"userId" === 789).distinct()
    val userSubsetRecs = model.recommendForUserSubset(users, 10)
    userSubsetRecs.printSchema()

    // `recommendations` is an array<struct<movieId:int, rating:float>>.
    // In Spark 2.x there is no clean DataFrame accessor for this shape, so
    // drop to the RDD API and read each element as a Row.
    // (No trailing comma in the tuple — trailing commas fail to compile on
    // Scala 2.11, which Spark 2.x builds commonly target.)
    val content = userSubsetRecs.select($"recommendations").rdd.map(_.getSeq[Row](0))
    val recs = content.map(_.map(row =>
      (row.getAs[Int]("movieId"), row.getAs[Float]("rating"))
    ))
    recs.foreach(userRows => userRows.foreach { case (movieId, rating) =>
      println(s"movieId: $movieId rating: $rating")
    })

    println("done.")
    // Release the local SparkContext and its threads before the JVM exits.
    spark.stop()
  }
}