键值对RDD(即JavaPairRDD)通常用来进行聚合计算。
下面介绍键值对RDD的创建及其常用操作。
/**
* Created by hbin on 2016/12/9.
*/
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import breeze.optimize.linear.LinearProgram;
import io.netty.util.internal.StringUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
/**
* spark对数据的核心抽象 RDD(弹性分布式数据集)
* RDD就是分布式的元素集合,在spark中对数据的所有操作不外乎创建RDD
* 转化已有RDD以及调用RDD操作进行求值,spark会自动将RDD中的数据分发到集群上,
* 并将操作并行化
*/
public class BasicMap {
    /**
     * Demonstrates building a {@link JavaPairRDD} from a plain {@link JavaRDD}
     * and applying the common single-pair-RDD operations shown in the post:
     * mapValues, groupByKey, keys, values and sortByKey.
     *
     * @param args unused command-line arguments
     * @throws Exception propagated from Spark job execution
     */
    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        try {
            // Parallelize nine single-digit strings across 6 partitions.
            JavaRDD<String> rdd1 = jsc.parallelize(
                    Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), 6);
            // Key each element by its first whitespace-separated token; for these
            // single-token inputs the key equals the whole element.
            PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {
                @Override
                public Tuple2<String, String> call(String s) throws Exception {
                    return new Tuple2<>(s.split(" ")[0], s);
                }
            };
            JavaPairRDD<String, String> pairs = rdd1.mapToPair(keyData);
            // mapValues transforms only the value side, leaving keys (and
            // partitioning) untouched.
            JavaPairRDD<String, String> newPairs = pairs.mapValues(new Function<String, String>() {
                @Override
                public String call(String s) throws Exception {
                    return s + "*";
                }
            });
            System.out.println("newPairs=" + newPairs.collect());
            System.out.println(" pairs.collect()=" + pairs.collect() + " RDD1=" + rdd1.collect());
            System.out.println(" groupByKey()=" + pairs.groupByKey().collect());
            System.out.println("keys()=" + pairs.keys().collect());
            System.out.println("values()=" + pairs.values().collect());
            System.out.println("sortByKey()=" + pairs.sortByKey().collect());
        } finally {
            // Fix: the original never released the SparkContext; stop() in a
            // finally block guarantees cleanup even if a job above fails.
            jsc.stop();
        }
    }
}
执行结果:
newPairs=[(1,1*), (2,2*), (3,3*), (4,4*), (5,5*), (6,6*), (7,7*), (8,8*), (9,9*)]
pairs.collect()=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)] RDD1=[1, 2, 3, 4, 5, 6, 7, 8, 9]
groupByKey()=[(6,[6]), (7,[7]), (1,[1]), (8,[8]), (2,[2]), (9,[9]), (3,[3]), (4,[4]), (5,[5])]
keys()=[1, 2, 3, 4, 5, 6, 7, 8, 9]
values()=[1, 2, 3, 4, 5, 6, 7, 8, 9]
sortByKey()=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)]
针对两个pair RDD的转化操作,代码示例如下:
/**
* Created by hbin on 2016/12/9.
*/
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import breeze.optimize.linear.LinearProgram;
import io.netty.util.internal.StringUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
/**
* spark对数据的核心抽象 RDD(弹性分布式数据集)
* RDD就是分布式的元素集合,在spark中对数据的所有操作不外乎创建RDD
* 转化已有RDD以及调用RDD操作进行求值,spark会自动将RDD中的数据分发到集群上,
* 并将操作并行化
*/
public class BasicMap {
    /**
     * Demonstrates the two-pair-RDD transformations shown in the post:
     * subtractByKey, join, rightOuterJoin, leftOuterJoin and cogroup.
     *
     * @param args unused command-line arguments
     * @throws Exception propagated from Spark job execution
     */
    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        try {
            JavaRDD<String> rdd1 = jsc.parallelize(
                    Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), 6);
            JavaRDD<String> rdd2 = jsc.parallelize(
                    Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H", "J",
                            "1", "2", "3", "4", "5", "6", "7"), 6);
            // Key each element by its first whitespace-separated token.
            PairFunction<String, String, String> keyData1 = new PairFunction<String, String, String>() {
                @Override
                public Tuple2<String, String> call(String s) throws Exception {
                    return new Tuple2<>(s.split(" ")[0], s);
                }
            };
            // Key each element by its first character.
            // Fix: the original used s.split("")[0], whose first element is ""
            // on Java 7 but the first character on Java 8+ (the recorded output
            // shows Java 8+ behavior was intended). substring is unambiguous
            // on every Java version, and the empty-string guard avoids an
            // IndexOutOfBoundsException on empty input.
            PairFunction<String, String, String> keyData2 = new PairFunction<String, String, String>() {
                @Override
                public Tuple2<String, String> call(String s) throws Exception {
                    return new Tuple2<>(s.isEmpty() ? s : s.substring(0, 1), s);
                }
            };
            JavaPairRDD<String, String> pairsRDD1 = rdd1.mapToPair(keyData1);
            // Fix: variable was misspelled "parirsRDD2" in the original. The
            // println labels below are kept verbatim so the captured output in
            // the post still matches.
            JavaPairRDD<String, String> pairsRDD2 = rdd2.mapToPair(keyData2);
            System.out.println("pairsRDD1=" + pairsRDD1.collect());
            System.out.println("pairsRDD2=" + pairsRDD2.collect());
            System.out.println("pairsRDD1.subtractByKey(parirsRDD2).collect()=" + pairsRDD1.subtractByKey(pairsRDD2).collect());
            System.out.println("pairsRDD1.join(parirsRDD2)=" + pairsRDD1.join(pairsRDD2).collect());
            System.out.println("pairsRDD1.rightOuterJoin(parirsRDD2)=" + pairsRDD1.rightOuterJoin(pairsRDD2).collect());
            System.out.println("pairsRDD1.leftOuterJoin(parirsRDD2)=" + pairsRDD1.leftOuterJoin(pairsRDD2).collect());
            System.out.println("pairsRDD1.cogroup(parirsRDD2)=" + pairsRDD1.cogroup(pairsRDD2).collect());
        } finally {
            // Fix: the original leaked the SparkContext; always stop() it.
            jsc.stop();
        }
    }
}
执行结果:
pairsRDD1=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)]
pairsRDD2=[(A,A), (B,B), (C,C), (D,D), (E,E), (F,F), (G,G), (H,H), (J,J), (1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7)]
pairsRDD1.subtractByKey(parirsRDD2).collect()=[(8,8), (9,9)]
pairsRDD1.join(parirsRDD2)=[(6,(6,6)), (7,(7,7)), (1,(1,1)), (2,(2,2)), (3,(3,3)), (4,(4,4)), (5,(5,5))]
pairsRDD1.rightOuterJoin(parirsRDD2)=[(B,(Optional.absent(),B)), (6,(Optional.of(6),6)), (H,(Optional.absent(),H)), (7,(Optional.of(7),7)), (C,(Optional.absent(),C)), (1,(Optional.of(1),1)), (2,(Optional.of(2),2)), (J,(Optional.absent(),J)), (D,(Optional.absent(),D)), (3,(Optional.of(3),3)), (E,(Optional.absent(),E)), (4,(Optional.of(4),4)), (F,(Optional.absent(),F)), (G,(Optional.absent(),G)), (5,(Optional.of(5),5)), (A,(Optional.absent(),A))]
pairsRDD1.leftOuterJoin(parirsRDD2)=[(6,(6,Optional.of(6))), (7,(7,Optional.of(7))), (1,(1,Optional.of(1))), (8,(8,Optional.absent())), (2,(2,Optional.of(2))), (9,(9,Optional.absent())), (3,(3,Optional.of(3))), (4,(4,Optional.of(4))), (5,(5,Optional.of(5)))]
pairsRDD1.cogroup(parirsRDD2)=[(B,([],[B])), (6,([6],[6])), (H,([],[H])), (7,([7],[7])), (C,([],[C])), (1,([1],[1])), (8,([8],[])), (2,([2],[2])), (J,([],[J])), (D,([],[D])), (9,([9],[])), (3,([3],[3])), (E,([],[E])), (4,([4],[4])), (F,([],[F])), (G,([],[G])), (5,([5],[5])), (A,([],[A]))]