本文旨在描述一种思想,并不具有实际意义。
本文将电影评分表ratings与电影基本信息表movies连接起来。
其中ratings表中包含四列(0-3),第1,2列是电影id与评分。movies包含3列(0-2),第0,1列为电影id和电影名。
首先对ratings表进行抽样,对抽样数据进行操作取出其中id值倾斜最严重的一列的id值,根据此id将ratings表分为两个RDD:一个为只含有倾斜id的表,一个为不含有倾斜id的表。将这两个表分别与movies连接后,将得到的结果合并。
代码如下:
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;
/**
 * Demonstrates a skew-aware join between the MovieLens ratings table and the
 * movies table.
 *
 * <p>A 10% sample of the ratings is counted per movie id to find the most
 * skewed key. The ratings RDD is then split into two RDDs — one holding only
 * the skewed key, one holding everything else — each half is joined with the
 * movies RDD separately, and the two join results are unioned.
 */
public class SkewJoin
{
    /**
     * Entry point: reads ratings.dat and movies.dat from HDFS, performs the
     * split join described above, and prints up to 500 joined rows.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("SkewJoin")
                .setMaster("spark://master:7077")
                .setJars(new String[]{"hdfs://master:9000/user/hadoop/SkewJoin.jar"});
        // try-with-resources: the original never stopped the context, leaking
        // the driver's cluster resources on exit.
        try (JavaSparkContext jsc = new JavaSparkContext(conf))
        {
            // ratings.dat line format: userId::movieId::rating::timestamp
            JavaRDD<String> ratings =
                    jsc.textFile("hdfs://master:9000/user/hadoop/ml-1m/ratings.dat");
            // (movieId, rating) pairs — fields[1] is the movie id, fields[2] the rating.
            JavaPairRDD<Integer, Double> rate = ratings.mapToPair(t -> {
                String[] fields = t.split("::");
                return new Tuple2<>(Integer.parseInt(fields[1]), Double.parseDouble(fields[2]));
            });
            // Sample 10% without replacement to estimate key skew cheaply.
            JavaPairRDD<Integer, Double> sample = rate.sample(false, 0.1);
            // Count occurrences per movie id, flip to (count, id), sort by count
            // descending, and take the single most frequent key.
            List<Tuple2<Integer, Integer>> top = sample
                    .mapToPair(t -> new Tuple2<>(t._1, 1))
                    .reduceByKey((a, b) -> a + b)
                    .mapToPair(t -> new Tuple2<>(t._2, t._1))
                    .sortByKey(false)
                    .take(1);
            if (top.isEmpty())
            {
                // Guard: an empty sample (e.g. a tiny input file) would otherwise
                // throw IndexOutOfBoundsException on get(0).
                System.out.println("empty sample, nothing to join");
                return;
            }
            final int maxkey = top.get(0)._2;
            System.out.println(maxkey);
            // movies.dat line format: movieId::title::genres -> (movieId, title)
            JavaPairRDD<Integer, String> movies =
                    jsc.textFile("hdfs://master:9000/user/hadoop/ml-1m/movies.dat")
                            .mapToPair(t -> {
                                String[] fields = t.split("::");
                                return new Tuple2<>(Integer.parseInt(fields[0]), fields[1]);
                            });
            // Join the skewed key and the remaining keys as two separate jobs.
            // (v1._1 is unboxed against the primitive maxkey, so == compares values.)
            JavaPairRDD<Integer, Tuple2<Double, String>> join1 =
                    rate.filter(v1 -> v1._1 == maxkey).join(movies);
            JavaPairRDD<Integer, Tuple2<Double, String>> join2 =
                    rate.filter(v1 -> v1._1 != maxkey).join(movies);
            // Recombine both halves and print a bounded preview.
            System.out.println(jsc.union(join2, join1).take(500));
        }
    }
}