我们使用 Spark Java 版本的 groupByKey 算子，对数据按 key 进行分组。
package com.xzdream.operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Iterator;
/**
 * Small Spark example that groups the values of a pair RDD by key using
 * {@code groupByKey} and prints every group.
 *
 * <p>Each input line is split on a single space; the first token is used as the
 * key and the second token is parsed as an integer value.
 */
public class GroupByKey {

    /** Input file that is read when no path is given on the command line. */
    private static final String DEFAULT_INPUT =
            "file:/Users/zhudechao/gitee/bigdata/xzdream_spark/input/a.txt";

    /**
     * Reads the input file, builds {@code (name, number)} pairs, groups them by
     * name and prints each key followed by its values and a separator line.
     *
     * @param args optional; if present, {@code args[0]} is used as the input path
     *             instead of {@link #DEFAULT_INPUT}
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

        SparkConf conf = new SparkConf();
        conf.setAppName("Map").setMaster("local");

        // JavaSparkContext is Closeable; try-with-resources makes sure the context
        // is shut down even when the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> rdd = sc.textFile(inputPath);

            JavaPairRDD<String, Integer> n = rdd.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    String[] a = s.split(" ");
                    String name = a[0];
                    // Integer.valueOf replaces the deprecated new Integer(String) constructor.
                    Integer num = Integer.valueOf(a[1]);
                    return new Tuple2<>(name, num);
                }
            });

            JavaPairRDD<String, Iterable<Integer>> groupSource = n.groupByKey();

            groupSource.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                @Override
                public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                    // Print the key, then every value grouped under it.
                    System.out.println(t._1);
                    for (Integer value : t._2) {
                        System.out.println(value);
                    }
                    System.out.println("----------------");
                }
            });
        }
    }
}
输入数据