package cn.spark.study.project.movie;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Movie ranking example: computes the average rating of every movie found in a
 * ratings file and prints the highest-rated movies together with their titles.
 *
 * <p>Input data can be downloaded from https://grouplens.org/datasets/movielens/
 * (movies.dat, ratings.dat, users.dat). Fields on each line are separated by
 * {@code ::}. This class reads field 1 (movie id) and field 2 (rating) of each
 * ratings line, and field 0 (movie id) and field 1 (title) of each movies line.
 *
 * @author dahai
 */
public class RDD_Movie_Users_Analyzer {

    /** Ratings file used when no path is given on the command line. */
    private static final String DEFAULT_RATINGS_PATH =
            "C://Users//dahai//Desktop//sparkdata//movie/ratings.dat";

    /** Movies file used when no path is given on the command line. */
    private static final String DEFAULT_MOVIES_PATH =
            "C://Users//dahai//Desktop//sparkdata//movie/movies.dat";

    /** Separator between the fields of a line in the input files. */
    private static final String FIELD_SEPARATOR = "::";

    /** Number of movies printed by {@link #getTop20Ave}. */
    private static final int TOP_N = 20;

    /**
     * Runs the analysis on a local Spark master.
     *
     * @param args optional; {@code args[0]} overrides the ratings file path and
     *             {@code args[1]} overrides the movies file path. Missing
     *             arguments fall back to the built-in default paths.
     */
    public static void main(String[] args) {
        String ratingsPath = (args != null && args.length > 0) ? args[0] : DEFAULT_RATINGS_PATH;
        String moviesPath = (args != null && args.length > 1) ? args[1] : DEFAULT_MOVIES_PATH;

        // Step 1: describe the Spark application.
        SparkConf conf = new SparkConf()
                .setAppName("RDD_Movie_Users_Analyzer")
                .setMaster("local");

        // Step 2: create the context. It is closed in the finally block so the
        // context is released even when one of the jobs below fails.
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Step 3: create the initial RDDs, one element per input line.
            JavaRDD<String> ratingsRDD = sc.textFile(ratingsPath);
            JavaRDD<String> movieRDD = sc.textFile(moviesPath);

            // Average rating per movie id.
            JavaPairRDD<String, Double> aveRDD = getAveMovie(ratingsRDD);

            // Print the best-rated movies with their averages.
            getTop20Ave(aveRDD, movieRDD);
        } finally {
            sc.close();
        }
    }

    /**
     * Computes the average rating of every movie.
     *
     * <p>Every ratings line must contain at least three {@code ::}-separated
     * fields, with an integer rating in the third one; a line that does not
     * makes the job fail.
     *
     * @param ratingsRDD raw lines of the ratings file
     * @return pairs of (movie id, average rating rounded half-up to one decimal),
     *         for example {@code (508,3.9)}
     */
    private static JavaPairRDD<String, Double> getAveMovie(JavaRDD<String> ratingsRDD) {
        // 1. Map every line to (movie id, (rating, 1)) so that sums and counts
        //    can be accumulated together.
        JavaPairRDD<String, Tuple2<Long, Long>> ratingAndOne = ratingsRDD.mapToPair(
                new PairFunction<String, String, Tuple2<Long, Long>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Tuple2<Long, Long>> call(String row) throws Exception {
                        String[] fields = row.split(FIELD_SEPARATOR);
                        String movieId = fields[1];
                        Long rating = Long.valueOf(fields[2]);
                        return new Tuple2<String, Tuple2<Long, Long>>(
                                movieId, new Tuple2<Long, Long>(rating, 1L));
                    }
                });

        // 2. Sum ratings and counts per movie id,
        //    e.g. (1486,(22,7)): movie 1486 has a rating total of 22 from 7 ratings.
        JavaPairRDD<String, Tuple2<Long, Long>> totals = ratingAndOne.reduceByKey(
                new Function2<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<Long, Long> call(Tuple2<Long, Long> left, Tuple2<Long, Long> right)
                            throws Exception {
                        // Primitive arithmetic avoids needless boxing of the partial sums.
                        long ratingSum = left._1() + right._1();
                        long ratingCount = left._2() + right._2();
                        return new Tuple2<Long, Long>(ratingSum, ratingCount);
                    }
                });

        // 3. Turn (sum, count) into the rounded average.
        return totals.mapToPair(
                new PairFunction<Tuple2<String, Tuple2<Long, Long>>, String, Double>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Double> call(Tuple2<String, Tuple2<Long, Long>> entry)
                            throws Exception {
                        Tuple2<Long, Long> sumAndCount = entry._2();
                        double average = roundedAverage(sumAndCount._1(), sumAndCount._2());
                        return new Tuple2<String, Double>(entry._1(), average);
                    }
                });
    }

    /**
     * Divides {@code ratingSum} by {@code ratingCount} and rounds the exact
     * quotient half-up to one decimal place.
     *
     * <p>The division is done in decimal arithmetic on purpose: going through
     * {@code float} turns exact ties such as 87 / 20 = 4.35 into 4.3499999,
     * which then rounds to 4.3 instead of 4.4.
     *
     * @param ratingSum   sum of all ratings of one movie
     * @param ratingCount number of ratings of that movie; must not be zero
     * @return the average with one decimal place
     * @throws ArithmeticException if {@code ratingCount} is zero
     */
    private static double roundedAverage(long ratingSum, long ratingCount) {
        return BigDecimal.valueOf(ratingSum)
                .divide(BigDecimal.valueOf(ratingCount), 1, RoundingMode.HALF_UP)
                .doubleValue();
    }

    /**
     * Prints the 20 movies with the highest average rating, best first, one
     * line per movie: the title, a fixed label, then the average.
     *
     * <p>Every movies line must contain at least two {@code ::}-separated fields.
     *
     * @param aveRDD   pairs of (movie id, average rating)
     * @param movieRDD raw lines of the movies file
     */
    private static void getTop20Ave(JavaPairRDD<String, Double> aveRDD, JavaRDD<String> movieRDD) {
        // Step 4: extract (movie id, title) from the movies file.
        JavaPairRDD<String, String> titlesById = movieRDD.mapToPair(
                new PairFunction<String, String, String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, String> call(String line) throws Exception {
                        String[] fields = line.split(FIELD_SEPARATOR);
                        return new Tuple2<String, String>(fields[0], fields[1]);
                    }
                });

        // Step 5: join titles with averages on the movie id and re-key by the
        // average so the result can be sorted by rating.
        JavaPairRDD<Double, String> titlesByAverage = titlesById.join(aveRDD).mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String, Double>>, Double, String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<Double, String> call(Tuple2<String, Tuple2<String, Double>> joined)
                            throws Exception {
                        String movieName = joined._2()._1();
                        Double average = joined._2()._2();
                        return new Tuple2<Double, String>(average, movieName);
                    }
                });

        // Step 6: sort by average in descending order and print the first TOP_N entries.
        List<Tuple2<Double, String>> topMovies = titlesByAverage.sortByKey(false).take(TOP_N);
        for (Tuple2<Double, String> topMovie : topMovies) {
            String movieName = topMovie._2();
            Double ave = topMovie._1();
            System.out.println(movieName+" 平均点评分数为:"+ave);
        }
    }
}