// RDD learning: http://blog.csdn.net/dream_an/article/details/50524340
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochRequestProto;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.linalg.Vector;
import org.codehaus.janino.Java;
import scala.Tuple2;
import scala.annotation.meta.field;
/**
 * Spark RDD join example: computes the average rating of every movie from a
 * "::"-separated ratings file, joins the averages with the movie titles from a
 * "::"-separated movies file, and prints every movie whose average rating is
 * strictly greater than 4.0 as {@code "<movieId> <average> <title>"}.
 */
public class spark_join {

    /** Field separator used by both input files. */
    private static final String FIELD_SEPARATOR = "::";

    /** Ratings file used when no path is given on the command line. */
    private static final String DEFAULT_RATINGS_PATH = "/usr/local/spark/ml-1m/ratings.dat";

    /** Movies file used when no path is given on the command line. */
    private static final String DEFAULT_MOVIES_PATH = "/usr/local/spark/ml-1m/movies.dat";

    /** Only movies whose average rating is strictly above this value are printed. */
    private static final double MIN_AVERAGE_RATING = 4.0;

    /**
     * Runs the job on a local Spark master.
     *
     * @param args optional overrides: {@code args[0]} is the ratings file path and
     *             {@code args[1]} is the movies file path; when absent, the default
     *             ml-1m locations are used
     */
    public static void main(String[] args) {
        String ratingsPath = args.length > 0 ? args[0] : DEFAULT_RATINGS_PATH;
        String moviesPath = args.length > 1 ? args[1] : DEFAULT_MOVIES_PATH;

        SparkConf conf = new SparkConf().setAppName("spark_join_test").setMaster("local");

        // try-with-resources stops the context even when the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Filter before joining so that only the qualifying movies are shuffled.
            JavaPairRDD<Integer, Double> highScores = averageRatingByMovie(sc.textFile(ratingsPath))
                    .filter(score -> score._2 > MIN_AVERAGE_RATING);

            // NOTE(review): textFile decodes lines as UTF-8; if movies.dat is stored in
            // another encoding (e.g. ISO-8859-1), accented titles will be garbled - confirm.
            JavaPairRDD<Integer, String> titles = titleByMovie(sc.textFile(moviesPath));

            JavaPairRDD<Integer, Tuple2<Double, String>> result = highScores.join(titles);
            result.foreach(a -> System.out.println(a._1 + " " + a._2._1 + " " + a._2._2));
        }
    }

    /**
     * Builds (movieId, average rating) pairs from raw rating lines. Field 1 of each
     * line is read as the integer movie id and field 2 as the rating.
     * A single (sum, count) aggregation replaces separate sum and count passes.
     */
    private static JavaPairRDD<Integer, Double> averageRatingByMovie(JavaRDD<String> ratingLines) {
        JavaPairRDD<Integer, Tuple2<Double, Integer>> ratingWithOne = ratingLines.mapToPair(line -> {
            String[] fields = line.split(FIELD_SEPARATOR);
            return new Tuple2<Integer, Tuple2<Double, Integer>>(
                    Integer.parseInt(fields[1]),
                    new Tuple2<Double, Integer>(Double.parseDouble(fields[2]), 1));
        });

        JavaPairRDD<Integer, Tuple2<Double, Integer>> sumAndCount = ratingWithOne.reduceByKey(
                (a, b) -> new Tuple2<Double, Integer>(a._1 + b._1, a._2 + b._2));

        JavaPairRDD<Integer, Double> averages = sumAndCount.mapValues(sc -> sc._1 / sc._2);
        return averages;
    }

    /**
     * Builds (movieId, title) pairs from raw movie lines. Field 0 of each line is
     * read as the integer movie id and field 1 as the title.
     */
    private static JavaPairRDD<Integer, String> titleByMovie(JavaRDD<String> movieLines) {
        JavaPairRDD<Integer, String> titles = movieLines.mapToPair(line -> {
            String[] fields = line.split(FIELD_SEPARATOR);
            return new Tuple2<Integer, String>(Integer.parseInt(fields[0]), fields[1]);
        });
        return titles;
    }
}
输出:
1084 4.096209912536443 Bonnie and Clyde (1967)
3007 4.013559322033898 American Movie (1999)
2493 4.142857142857143 Harmonists, The (1997)
3517 4.5 Bells, The (1926)
1 4.146846413095811 Toy Story (1995)
1780 4.125 Ayn Rand: A Sense of Life (1997)
2351 4.207207207207207 Nights of Cabiria (Le Notti di Cabiria) (1957)
759 4.101694915254237 Maya Lin: A Strong Clear Vision (1994)
1300 4.1454545454545455 My Life as a Dog (Mitt liv som hund) (1985)
1947 4.057818659658344 West Side Story (1961)
2819 4.040752351097178 Three Days of the Condor (1975)
162 4.063136456211812 Crumb (1994)
1228 4.1875923190546525 Raging Bull (1980)
306 4.227544910179641 Three Colors: Red (1994)
1132 4.259090909090909 Manon of the Spring (Manon des sources) (1986)
2132 4.074074074074074 Who's Afraid of Virginia Woolf? (1966)
720 4.426940639269406 Wallace & Gromit: The Best of Aardman Animation (1996)
2917 4.031746031746032 Body Heat (1981)
1066 4.1657142857142855 Shall We Dance? (1937)
2972 4.015384615384615 Red Sorghum (Hong Gao Liang) (1987)
922 4.491489361702127 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
446 4.082677165354331 Farewell My Concubine (1993)
1795 4.5 Callejón de los milagros, El (1995)
2930 4.4 Return with Honor (1998)
1117 4.25 Eighth Day, The (Le Huitième jour ) (1996)
1545 4.068493150684931 Ponette (1996)
3435 4.415607985480944 Double Indemnity (1944)
1207 4.425646551724138 To Kill a Mockingbird (1962)
1294 4.124658780709736 M*A*S*H (1970)
2194 4.007985803016859 Untouchables, The (1987)
3022 4.368932038834951 General, The (1927)
858 4.524966261808367 Godfather, The (1972)
1196 4.292976588628763 Star Wars: Episode V - The Empire Strikes Back (1980)
3090 4.141242937853107 Matewan
.......