键值对聚合操作(reduceByKey、foldByKey、sortByKey、groupByKey、cogroup、subtractByKey、join)
1. reduceByKey
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
接收一个函数,按照相同的key进行reduce操作,类似于scala的reduce的操作
例如RDD {(1, 2), (3, 4), (3, 6)}进行reduce
scala版本
// Sum values per key: (1,2),(3,4),(3,6) -> (1,2),(3,10).
// Fix: use val instead of var — neither reference is ever reassigned.
val mapRDD = sc.parallelize(List((1, 2), (3, 4), (3, 6)))
val reduceRDD = mapRDD.reduceByKey((x, y) => x + y)
reduceRDD.foreach(x => println(x))
------输出---------
(1,2)
(3,10)
再举例
单词计数
F:\sparktest\sample.txt中的内容如下
aa bb cc aa aa aa dd dd ee ee ee ee
ff aa bb zks
ee kks
ee zz zks
scala版本
// Word count: split each line on single spaces, pair every word with 1,
// then sum the counts for identical words.
val lines = sc.textFile("F:\\sparktest\\sample.txt")
val wordsRDD = lines.flatMap(_.split(" ")).map(word => (word, 1))
val wordCountRDD = wordsRDD.reduceByKey(_ + _)
wordCountRDD.foreach(println)
---------输出-----------
(ee,6)
(aa,5)
(dd,2)
(zz,1)
(zks,2)
(kks,1)
(ff,1)
(bb,2)
(cc,1)
java版本
/**
 * Word count with reduceByKey: split each input line into words, emit a
 * (word, 1) pair per word, then sum the counts for identical keys.
 */
public class reduceByKeyRDDJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("reduceByKeyJava");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            JavaRDD<String> words = sc.textFile("in/word.txt");
            // One input line -> many (word, 1) pairs.
            JavaPairRDD<String, Integer> wordsPairRDD = words.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                @Override
                public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
                    ArrayList<Tuple2<String, Integer>> tpLists = new ArrayList<>();
                    for (String str : s.split(" ")) {
                        tpLists.add(new Tuple2<>(str, 1));
                    }
                    return tpLists.iterator();
                }
            });
            // Merge the counts of identical keys.
            JavaPairRDD<String, Integer> wordCount = wordsPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            });
            // Iterate entries directly instead of keySet() + get() (avoids a
            // second lookup per key).
            for (Map.Entry<String, Integer> entry : wordCount.collectAsMap().entrySet()) {
                System.out.println("(" + entry.getKey() + "," + entry.getValue() + ")");
            }
        } finally {
            // Fix: the context was never stopped, leaking the local Spark backend.
            sc.stop();
        }
    }
}
2. foldByKey
def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)]
def foldByKey(zeroValue: V, partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)]
该函数用于RDD[K,V]根据K将V做折叠、合并处理。其中的参数zeroValue是每个key折叠的初始值:每个key的第一个value先与zeroValue用func合并,其结果再依次与该key的其余value合并。
与reduceByKey不同的是,foldByKey开始折叠的第一个元素不是该key分组中的第一个元素,而是传入的初始值zeroValue
scala版本
/**
 * foldByKey demo: like reduceByKey, but folding for each key starts from the
 * supplied zeroValue (0 here) instead of the key's first element.
 */
object foldByKeyRDDScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("foldByKeyScala").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val rdd1: RDD[(String, Int)] =
        sc.parallelize(List(("A", 2), ("A", 3), ("B", 5), ("B", 8)))
      // The println traces each fold step; x starts at zeroValue (0) per key.
      rdd1.foldByKey(0)((x, y) => { println("one:" + x + "two:" + y); x + y })
        .collect.foreach(println)
    } finally {
      sc.stop() // fix: the context was never stopped
    }
  }
}
3. SortByKey
def sortByKey(ascending : scala.Boolean = { /* compiled code */ }, numPartitions : scala.Int = { /* compiled code */ }) : org.apache.spark.rdd.RDD[scala.Tuple2[K, V]] = { /* compiled code */ }
SortByKey用于对pairRDD按照key进行排序,第一个参数可以设置true或者false,默认是true
scala版本
scala> val rdd = sc.parallelize(Array((3, 4),(1, 2),(4,4),(2,5), (6,5), (5, 6)))
// sortByKey is not an action — it is only a transformation
scala> rdd.sortByKey()
res9: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[28] at sortByKey at <console>:24
// inspect what sortByKey produces once collected
scala> rdd.sortByKey().collect()
res10: Array[(Int, Int)] = Array((1,2), (2,5), (3,4), (4,4), (5,6), (6,5))
// descending order
scala> rdd.sortByKey(false).collect()
res12: Array[(Int, Int)] = Array((6,5), (5,6), (4,4), (3,4), (2,5), (1,2))
java版本
/**
 * sortByKey demo: build (Integer, String) pairs, sort ascending by key, and
 * print the sorted tuples.
 */
public class sortByKeyRDDJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sortByKeyJava").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            ArrayList<Tuple2<Integer, String>> list = new ArrayList<>();
            list.add(new Tuple2<>(5, "hello"));
            list.add(new Tuple2<>(4, "world"));
            list.add(new Tuple2<>(3, "spark"));
            list.add(new Tuple2<>(2, "scala"));
            list.add(new Tuple2<>(1, "china"));
            JavaRDD<Tuple2<Integer, String>> rdd1 = sc.parallelize(list);
            // Identity pair function; the println shows when each element is mapped.
            PairFunction<Tuple2<Integer, String>, Integer, String> pairFunction = new PairFunction<Tuple2<Integer, String>, Integer, String>() {
                @Override
                public Tuple2<Integer, String> call(Tuple2<Integer, String> tup2) throws Exception {
                    System.out.println("PairFunction" + tup2._1 + " " + tup2._2);
                    return tup2;
                }
            };
            JavaPairRDD<Integer, String> pairRDD = rdd1.mapToPair(pairFunction);
            // true = ascending by key.
            List<Tuple2<Integer, String>> collect = pairRDD.sortByKey(true).collect();
            // Fix: iterate with the generic element type instead of raw Tuple2
            // (raw type produced an unchecked warning).
            for (Tuple2<Integer, String> tp2 : collect) {
                System.out.println(tp2);
            }
        } finally {
            sc.stop(); // fix: the context was never stopped
        }
    }
}
4. groupByKey
def groupByKey(): RDD[(K, Iterable[V])]
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
groupByKey会将RDD[key,value] 按照相同的key进行分组,形成RDD[key,Iterable[value]]的形式, 有点类似于sql中的groupby,例如类似于mysql中的group_concat
例如这个例子, 我们对学生的成绩进行分组
scala版本
/**
 * groupByKey demo: group per-student scores into (name, Iterable[score]).
 */
object groupByKeyScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("groupByKeyScala").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val scoreDetail = sc.parallelize(List(
        ("xiaohei", 67),
        ("xiaohei", 98),
        ("xiaomin", 45),
        ("xiaomin", 69),
        ("xiaozi", 75),
        ("xiaozi", 64),
        ("xiaolan", 85),
        ("xiaolan", 96)
      ))
      scoreDetail.groupByKey().collect.foreach(println)
    } finally {
      sc.stop() // fix: the context was never stopped
    }
  }
}
java版本
/**
 * groupByKey demo (Java): group per-student scores into
 * (name, Iterable&lt;score&gt;) and print each group.
 */
public class groupByKeyJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("groupByKeyJava");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Fix: use typed Tuple2 constructors instead of raw types
            // (raw Tuple2 produced unchecked warnings).
            JavaRDD<Tuple2<String, Integer>> scoreDetail = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("xiaohei", 67),
                new Tuple2<String, Integer>("xiaohei", 98),
                new Tuple2<String, Integer>("xiaomin", 45),
                new Tuple2<String, Integer>("xiaomin", 69),
                new Tuple2<String, Integer>("xiaozi", 75),
                new Tuple2<String, Integer>("xiaozi", 64),
                new Tuple2<String, Integer>("xiaolan", 85),
                new Tuple2<String, Integer>("xiaolan", 96)
            ));
            JavaPairRDD<String, Integer> scoreMapRDD = JavaPairRDD.fromJavaRDD(scoreDetail);
            // Iterate entries directly instead of keySet() + get() (one lookup per key).
            for (Map.Entry<String, Iterable<Integer>> entry
                    : scoreMapRDD.groupByKey().collectAsMap().entrySet()) {
                System.out.println("(" + entry.getKey() + "," + entry.getValue() + ")");
            }
        } finally {
            sc.stop(); // fix: the context was never stopped
        }
    }
}
5. cogroup
groupByKey是对单个 RDD 的数据进行分组,还可以使用一个叫作 cogroup() 的函数对多个共享同一个键的 RDD 进行分组
例如
RDD1.cogroup(RDD2) 会将RDD1和RDD2按照相同的key进行分组,得到RDD[(key,(Iterable[value1],Iterable[value2]))]的形式
cogroup也可以多个进行分组
例如RDD1.cogroup(RDD2,RDD3,…RDDN), 可以得到(key,Iterable[value1],Iterable[value2],Iterable[value3],…,Iterable[valueN])
案例:scoreDetail存放的是学生的优秀学科的分数,scoreDetail2存放的是刚刚及格的分数,scoreDetail3存放的是没有及格的科目的分数,我们要对每一个学生的优秀学科、刚及格和不及格的分数分组统计出来
scala版本
/**
 * cogroup demo: combine three score RDDs that share the same key type into
 * (key, (Iterable[v1], Iterable[v2], Iterable[v3])) and print each entry.
 */
object cogroupRDDScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("cogroupScala").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val score1 = sc.parallelize(List(("xiaomin", 97), ("xiaomin", 90), ("lihua", 98), ("lihua", 76)))
      val score2 = sc.parallelize(List(("xiaomin", 34), ("lihua", 56), ("lihua", 73), ("xiaofeng", 86)))
      val score3 = sc.parallelize(List(("xiaofeng", 78), ("lihua", 90), ("lihua", 73), ("xiaofeng", 86)))
      score1.cogroup(score2, score3).foreach(println)
    } finally {
      sc.stop() // fix: the context was never stopped
    }
  }
}
java版本
/**
 * cogroup demo (Java): group three score RDDs by student key into
 * (key, (Iterable&lt;v1&gt;, Iterable&lt;v2&gt;, Iterable&lt;v3&gt;)) and print each entry.
 */
public class cogroupRDDJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("cogroupJava").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Fix: use typed Tuple2 constructors instead of raw types
            // (raw Tuple2 produced unchecked warnings).
            JavaRDD<Tuple2<String, Integer>> score1 = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("xiaoming", 98),
                new Tuple2<String, Integer>("xiaoming", 93),
                new Tuple2<String, Integer>("xiaobai", 88),
                new Tuple2<String, Integer>("xiaobai", 58),
                new Tuple2<String, Integer>("xiaolan", 58),
                new Tuple2<String, Integer>("xiaolan", 68)
            ));
            JavaRDD<Tuple2<String, Integer>> score2 = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("xiaolan", 56),
                new Tuple2<String, Integer>("xiaolan", 67),
                new Tuple2<String, Integer>("xiaoming", 58),
                new Tuple2<String, Integer>("xiaoming", 38),
                new Tuple2<String, Integer>("xiaobai", 78),
                new Tuple2<String, Integer>("xiaobai", 48)
            ));
            JavaRDD<Tuple2<String, Integer>> score3 = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("xiaozi", 56),
                new Tuple2<String, Integer>("xiaozi", 67),
                new Tuple2<String, Integer>("xiaoming", 18),
                new Tuple2<String, Integer>("xiaoming", 28),
                new Tuple2<String, Integer>("xiaobai", 58),
                new Tuple2<String, Integer>("xiaobai", 28)
            ));
            JavaPairRDD<String, Integer> scoreMapRDD1 = JavaPairRDD.fromJavaRDD(score1);
            JavaPairRDD<String, Integer> scoreMapRDD2 = JavaPairRDD.fromJavaRDD(score2);
            JavaPairRDD<String, Integer> scoreMapRDD3 = JavaPairRDD.fromJavaRDD(score3);
            // Fix: cogroup already returns this exact type; the original cast was redundant.
            JavaPairRDD<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> cogroupRDD =
                scoreMapRDD1.cogroup(scoreMapRDD2, scoreMapRDD3);
            // Iterate entries directly instead of keySet() + get() (one lookup per key).
            for (Map.Entry<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> entry
                    : cogroupRDD.collectAsMap().entrySet()) {
                System.out.println("(" + entry.getKey() + "," + entry.getValue() + ")");
            }
        } finally {
            sc.stop(); // fix: the context was never stopped
        }
    }
}
6. subtractByKey
函数定义
def subtractByKey[W](other: RDD[(K, W)])(implicit arg0: ClassTag[W]): RDD[(K, V)]
def subtractByKey[W](other: RDD[(K, W)], numPartitions: Int)(implicit arg0: ClassTag[W]): RDD[(K, V)]
def subtractByKey[W](other: RDD[(K, W)], p: Partitioner)(implicit arg0: ClassTag[W]): RDD[(K, V)]
类似于subtract,删掉 RDD 中键与 other RDD 中的键相同的元素
7. join,fullOuterJoin, rightOuterJoin, leftOuterJoin
scala版本
/**
 * Pair-RDD join demos. Uncomment one line at a time to compare
 * subtractByKey / join / leftOuterJoin / rightOuterJoin.
 */
object KVguanjian {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("KVguanjian")
    val sc = new SparkContext(conf)
    try {
      val rdd = sc.makeRDD(Array((1, 2), (3, 4), (5, 6)))
      val other = sc.makeRDD(Array((3, 9)))
      //rdd.subtractByKey(other).collect.foreach(println)
      //rdd.join(other).collect.foreach(println)
      //rdd.leftOuterJoin(other).collect.foreach(println)
      rdd.rightOuterJoin(other).collect.foreach(println)
    } finally {
      sc.stop() // fix: the context was never stopped
    }
  }
}
java版本
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Map;
/**
 * Pair-RDD join demos (Java): subtractByKey, join, fullOuterJoin,
 * leftOuterJoin and rightOuterJoin on two small RDDs, printing each result map.
 */
public class KVguanjianJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("KVguanjianJava");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Fix: use typed Tuple2 constructors instead of raw types
            // (raw Tuple2 produced unchecked warnings).
            JavaRDD<Tuple2<Integer, Integer>> rddPre = sc.parallelize(Arrays.asList(
                new Tuple2<Integer, Integer>(1, 2),
                new Tuple2<Integer, Integer>(3, 4),
                new Tuple2<Integer, Integer>(5, 6)
            ));
            JavaRDD<Tuple2<Integer, Integer>> otherPre = sc.parallelize(Arrays.asList(
                new Tuple2<Integer, Integer>(3, 10),
                new Tuple2<Integer, Integer>(4, 8)
            ));
            // Convert the JavaRDDs of tuples to JavaPairRDDs.
            JavaPairRDD<Integer, Integer> rdd = JavaPairRDD.fromJavaRDD(rddPre);
            JavaPairRDD<Integer, Integer> other = JavaPairRDD.fromJavaRDD(otherPre);
            // Fix: every join already returns these exact types; the original
            // casts were redundant.
            JavaPairRDD<Integer, Integer> subtractRDD = rdd.subtractByKey(other);
            JavaPairRDD<Integer, Tuple2<Integer, Integer>> joinRDD = rdd.join(other);
            JavaPairRDD<Integer, Tuple2<Optional<Integer>, Optional<Integer>>> fullOutJoinRDD = rdd.fullOuterJoin(other);
            JavaPairRDD<Integer, Tuple2<Integer, Optional<Integer>>> leftOutJoinRDD = rdd.leftOuterJoin(other);
            JavaPairRDD<Integer, Tuple2<Optional<Integer>, Integer>> rightOuterJoinRDD = rdd.rightOuterJoin(other);
            // The five copy-pasted print loops are factored into one helper.
            printMap("subRDD", "subRDD", subtractRDD.collectAsMap());
            printMap("joinMap", "joinMap", joinRDD.collectAsMap());
            printMap("fullOutJoinMap", "fullOutJoinMap", fullOutJoinRDD.collectAsMap());
            printMap("leftOutJoinMap", "leftOutJoinMap", leftOutJoinRDD.collectAsMap());
            // NOTE: the original printed prefix "rightOuterJoinRDD" under the
            // "rightOutJoinMap" header; kept as-is to preserve the exact output.
            printMap("rightOutJoinMap", "rightOuterJoinRDD", rightOuterJoinRDD.collectAsMap());
        } finally {
            sc.stop(); // fix: the context was never stopped
        }
    }

    /** Prints "----------&lt;header&gt;" then one "&lt;prefix&gt;:key,value" line per map entry. */
    private static <K, V> void printMap(String header, String prefix, Map<K, V> map) {
        System.out.println("----------" + header);
        for (Map.Entry<K, V> entry : map.entrySet()) {
            System.out.println(prefix + ":" + entry.getKey() + "," + entry.getValue());
        }
    }
}