reduceByKey
聚合操作
// Signature of reduceByKey as rendered by a decompiler (the body is elided as "compiled code").
// It accepts a Function2[V, V, V] -- a function taking two values of type V and returning a V --
// and returns a JavaPairRDD[K, V].
// NOTE(review): presumably a member of JavaPairRDD[K, V]; the enclosing class is not shown here.
def reduceByKey(func :
org.apache.spark.api.java.function.Function2[V, V, V]) :
org.apache.spark.api.java.JavaPairRDD[K, V] =
{ /* compiled code */ }
Scala版
import org.apache.spark.{SparkConf, SparkContext}
/** Word-count demo showing `reduceByKey` on a pair RDD. */
object reduceByKey {

  /**
    * Reads `data/words.txt`, splits every line on single spaces, pairs each token with 1,
    * sums the 1s per token with `reduceByKey`, and prints each `(token, count)` pair.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[1]").setAppName("demo")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("data/words.txt")
      val counts = lines
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)
      counts.collect().foreach(println)
    } finally {
      // Release the context even when the job throws, so it does not linger in the JVM.
      sc.stop()
    }
  }
}
Java版:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Word-count demo showing {@code reduceByKey} with the Spark Java API.
 */
public class reduceByKey {

    /**
     * Reads {@code data/words.txt}, splits each line on runs of whitespace, emits a
     * {@code (token, 1)} pair per token, sums the 1s per token with {@code reduceByKey},
     * and prints every resulting {@code (token, count)} pair.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[1]").setAppName("reduceByKey");
        // JavaSparkContext is Closeable: try-with-resources stops it even if the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Load the input file.
            JavaRDD<String> words = sc.textFile("data/words.txt");

            JavaPairRDD<String, Integer> wordsPair = words.flatMapToPair(
                    new PairFlatMapFunction<String, String, Integer>() {
                        @Override
                        public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
                            List<Tuple2<String, Integer>> pairs = new ArrayList<>();
                            for (String token : s.split("\\s+")) {
                                // Fully typed tuple: the original raw Tuple2 made this add() unchecked.
                                pairs.add(new Tuple2<String, Integer>(token, 1));
                            }
                            return pairs.iterator();
                        }
                    });

            // Debug aid: uncomment to print the raw (token, 1) pairs before aggregation.
            // for (Tuple2<String, Integer> pair : wordsPair.collect()) {
            //     System.out.println(pair);
            // }

            JavaPairRDD<String, Integer> counts = wordsPair.reduceByKey(
                    new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer i1, Integer i2) throws Exception {
                            return i1 + i2;
                        }
                    });

            List<Tuple2<String, Integer>> collected = counts.collect();
            for (Tuple2<String, Integer> tuple2 : collected) {
                System.out.println(tuple2);
            }
        }
    }
}
foldByKey
Scala版本
/** Word-count demo showing `foldByKey` on a pair RDD. */
object foldByKey {

  /**
    * Reads `data/words.txt`, splits every line on single spaces, pairs each token with 1,
    * folds the 1s per token with `foldByKey`, and prints each `(token, count)` pair.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[1]").setAppName("demo")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("data/words.txt")
      val counts = lines
        .flatMap(_.split(" "))
        .map((_, 1))
        // The zero value must be neutral for the fold function because Spark may apply it
        // more than once per key; for addition that is 0. The previous value of 1 inflated
        // every count.
        .foldByKey(0)(_ + _)
      counts.collect().foreach(println)
    } finally {
      // Release the context even when the job throws, so it does not linger in the JVM.
      sc.stop()
    }
  }
}
sortByKey
按照key排序。先按key的范围进行分区,再在每个分区内排序,因此collect得到的结果是全局有序的。
sortByKey() 默认升序,sortByKey(false) 降序。
Scala版本:
import org.apache.spark.{SparkConf, SparkContext}
/** Demo showing `sortByKey` on a pair RDD with integer keys. */
object sortByKey {

  /**
    * Builds a small RDD of `(Int, String)` pairs, sorts it by key with the default
    * `sortByKey()` call, and prints the collected pairs.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[1]").setAppName("demo")
    val sc = new SparkContext(conf)
    try {
      val pairs = sc.makeRDD(List((3, "c"), (1, "a"), (2, "b"), (5, "e"), (4, "d")))
      val sorted = pairs.sortByKey()
      sorted.collect().foreach(println)
    } finally {
      // Release the context even when the job throws, so it does not linger in the JVM.
      sc.stop()
    }
  }
}
Java版本
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Demo showing {@code sortByKey} with the Spark Java API.
 */
public class SortByKey {

    /**
     * Builds a list of {@code (Integer, String)} tuples, turns it into a pair RDD via
     * {@code mapToPair}, sorts it by key in descending order, and prints the collected tuples.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[1]").setAppName("sortByKey");
        // JavaSparkContext is Closeable: try-with-resources stops it even if the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            List<Tuple2<Integer, String>> list = new ArrayList<>();
            list.add(new Tuple2<>(3, "sam"));
            list.add(new Tuple2<>(1, "sally"));
            list.add(new Tuple2<>(4, "john"));
            list.add(new Tuple2<>(2, "mary"));
            list.add(new Tuple2<>(5, "bruce"));
            list.add(new Tuple2<>(6, "sana"));
            JavaRDD<Tuple2<Integer, String>> rdd = sc.parallelize(list);

            // Identity mapping: its only job is to convert JavaRDD<Tuple2<K, V>> into JavaPairRDD<K, V>.
            PairFunction<Tuple2<Integer, String>, Integer, String> pairFunction =
                    new PairFunction<Tuple2<Integer, String>, Integer, String>() {
                        @Override
                        public Tuple2<Integer, String> call(Tuple2<Integer, String> t) throws Exception {
                            return t;
                        }
                    };

            // sortByKey() sorts ascending by default; sortByKey(false) sorts descending.
            JavaPairRDD<Integer, String> sort = rdd.mapToPair(pairFunction).sortByKey(false);

            List<Tuple2<Integer, String>> collected = sort.collect();
            for (Tuple2<Integer, String> tup2 : collected) {
                System.out.println(tup2);
            }
        }
    }
}