文章地址:http://www.haha174.top/article/details/254163
项目源码:https://github.com/haha174/spark.git
假设现在有一个文件,里面有如下一组数据:
1 5
2 4
3 6
1 3
2 1
需求:先按照第一列排序;如果第一列相同,再按照第二列排序(即二次排序)。
1.自定义二次排序的 key:需要实现 Ordered 和 Serializable 接口,并在 key 中实现针对多个列的比较逻辑。
实现Ordered 接口 需要重写其中的 6个方法:
$greater,$greater$eq,$less,$less$eq,compare,compareTo
重写逻辑如下
public class SecondSoftKey implements Ordered<SecondSoftKey> , Serializable {
/**
 * Creates a composite sort key from the two columns of a record.
 *
 * @param first  value of the first column
 * @param second value of the second column
 */
public SecondSoftKey(int first, int second) {
    this.second = second;
    this.first = first;
}
/**
 * Strict "greater than": true when this key's first column is larger, or the
 * first columns are equal and this key's second column is larger.
 *
 * @param that the key to compare against
 * @return true if this key is strictly greater than {@code that}
 */
@Override
public boolean $greater(SecondSoftKey that) {
    return this.first > that.getFirst()
            || (this.first == that.getFirst() && this.second > that.getSecond());
}
/**
 * "Greater than or equal": true when this key is strictly greater than
 * {@code that}, or when both columns are equal.
 *
 * @param that the key to compare against
 * @return true if this key is greater than or equal to {@code that}
 */
@Override
public boolean $greater$eq(SecondSoftKey that) {
    return this.$greater(that)
            || (this.getFirst() == that.getFirst() && this.getSecond() == that.getSecond());
}
/**
 * Strict "less than": true when this key's first column is smaller, or the
 * first columns are equal and this key's second column is smaller.
 *
 * @param that the key to compare against
 * @return true if this key is strictly less than {@code that}
 */
@Override
public boolean $less(SecondSoftKey that) {
    return this.first < that.getFirst()
            || (this.first == that.getFirst() && this.second < that.getSecond());
}
/**
 * "Less than or equal": true when this key is strictly less than
 * {@code that}, or when both columns are equal.
 *
 * @param that the key to compare against
 * @return true if this key is less than or equal to {@code that}
 */
@Override
public boolean $less$eq(SecondSoftKey that) {
    return this.$less(that)
            || (this.getFirst() == that.getFirst() && this.getSecond() == that.getSecond());
}
/**
 * Orders keys by the first column, breaking ties with the second column.
 *
 * Uses {@link Integer#compare(int, int)} instead of subtraction: a difference
 * such as {@code Integer.MIN_VALUE - 1} overflows and would report the wrong sign.
 *
 * @param that the key to compare against
 * @return a negative value, zero, or a positive value when this key is
 *         less than, equal to, or greater than {@code that}
 */
@Override
public int compare(SecondSoftKey that) {
    int byFirst = Integer.compare(this.getFirst(), that.getFirst());
    if (byFirst != 0) {
        return byFirst;
    }
    return Integer.compare(this.second, that.getSecond());
}
/**
 * Same ordering as {@link #compare(SecondSoftKey)}; delegates so the two
 * methods cannot drift apart.
 *
 * @param that the key to compare against
 * @return a negative value, zero, or a positive value when this key is
 *         less than, equal to, or greater than {@code that}
 */
@Override
public int compareTo(SecondSoftKey that) {
    return compare(that);
}
// Provide setters, getters, equals and hashCode for the columns used in sorting (not shown in this snippet).
// First column: compared first when ordering keys.
private int first;
// Second column: compared only when the first columns are equal.
private int second;
2.将包含文本的 RDD 映射成 key 为自定义 key、value 为原始文本的 JavaPairRDD。
// Key every text line by a SecondSoftKey built from its two space-separated
// integer columns; the original line is kept as the value.
JavaPairRDD<SecondSoftKey, String> pairs = list.mapToPair(new PairFunction<String, SecondSoftKey, String>() {
    @Override
    public Tuple2<SecondSoftKey, String> call(String line) throws Exception {
        String[] columns = line.split(" ");
        int firstColumn = Integer.parseInt(columns[0]);
        int secondColumn = Integer.parseInt(columns[1]);
        SecondSoftKey key = new SecondSoftKey(firstColumn, secondColumn);
        return new Tuple2<SecondSoftKey, String>(key, line);
    }
});
3.使用 sortByKey 按自定义 key 排序
// Sort the pairs in ascending order of the custom key.
JavaPairRDD<SecondSoftKey, String> resultPair = pairs.sortByKey(true);
4.再次映射保留文本RDD
// Drop the sort key and keep only the text line of each sorted pair.
JavaRDD<String> result = resultPair.map(new Function<Tuple2<SecondSoftKey, String>, String>() {
    @Override
    public String call(Tuple2<SecondSoftKey, String> keyedLine) throws Exception {
        String text = keyedLine._2;
        return text;
    }
});
5.输出即可
// Print every sorted line to standard output.
result.foreach(new VoidFunction<String>() {
    @Override
    public void call(String sortedLine) throws Exception {
        System.out.println(sortedLine);
    }
});
下面给出scala 示例:
相对于 Java,Scala 版本的代码量更少,只需要重写一个 compare 方法即可。
/** Composite key for secondary sort: ordered by `first`, ties broken by `second`.
  *
  * @param first  value of the first column
  * @param second value of the second column
  */
class SecondSoftKey(val first: Int, val second: Int) extends Ordered[SecondSoftKey] with Serializable {

  /** Compares two keys column by column.
    *
    * Uses `Integer.compare` rather than subtraction: a difference such as
    * `Int.MinValue - 1` overflows and would report the wrong sign.
    *
    * @param that the key to compare against
    * @return a negative value, zero, or a positive value when this key is
    *         less than, equal to, or greater than `that`
    */
  override def compare(that: SecondSoftKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}
/** Demo driver: reads a text file of two space-separated integer columns,
  * sorts the lines by a [[SecondSoftKey]] built from those columns, and prints them.
  */
object SecondSoftDemo {

  /** Runs the secondary-sort job with a local master.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SecondSoft").setMaster("local")
    val sc = new SparkContext(sparkConf)
    try {
      val lines = sc.textFile("C:\\Users\\haha174\\Desktop\\data\\test.txt", 1)
      val pairs = lines.map { line =>
        // Split once and reuse the columns instead of splitting per column.
        val columns = line.split(" ")
        (new SecondSoftKey(columns(0).toInt, columns(1).toInt), line)
      }
      val softPairs = pairs.sortByKey()
      val softedLines = softPairs.map(_._2)
      softedLines.foreach(line => println(line))
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
完整版的代码请见git