countByKey
def countByKey(): Map[K, Long]
scala例子
// Spark setup for the countByKey example (uses all local cores).
val conf = new SparkConf().setMaster("local[*]").setAppName("CountByKey_CollectAsMap_Scala")
val sc = new SparkContext(conf)
// Common Scala setup: a pair RDD of (key, value) tuples.
val pairs = Seq((1, 2), (2, 4), (2, 5), (3, 1), (3, 9), (3, 6))
val rdd = sc.parallelize(pairs)
// countByKey returns the number of records per key as a Map[K, Long].
val keyCounts = rdd.countByKey()
println(keyCounts)
// Output: Map(1 -> 1, 2 -> 2, 3 -> 3)
java例子
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("countByKey_collectAsMap_Java");
JavaSparkContext sc = new JavaSparkContext(conf);
// Common Java setup: build the pair RDD directly with parallelizePairs
// instead of parallelize(...) followed by JavaPairRDD.fromJavaRDD(...).
JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 4),
        new Tuple2<>(2, 5),
        new Tuple2<>(3, 1),
        new Tuple2<>(3, 9),
        new Tuple2<>(3, 6)));
// countByKey returns a java.util.Map<K, Long>: how many records carry each key.
Map<Integer, Long> countByKeyMap = pairRdd.countByKey();
// Iterate over entrySet() instead of keySet() + get(key): one lookup per entry.
for (Map.Entry<Integer, Long> entry : countByKeyMap.entrySet()) {
    System.out.println(entry.getKey() + " : " + entry.getValue());
}
/* Output:
1 : 1
2 : 2
3 : 3
*/
collectAsMap
将pair类型(键值对类型)的RDD收集到driver端并转换成Map; 注意当同一个key出现多次时, 结果中每个key只保留一个value, 数据还是上面的例子
scala例子
// collectAsMap brings the pair RDD back to the driver as a Map; when a key
// appears more than once, only one of its values is kept (see key 2 below).
val data = Seq((1, 2), (2, 4), (2, 5), (3, 1), (3, 9), (3, 6))
val rdd = sc.parallelize(data)
val asMap: collection.Map[Int, Int] = rdd.collectAsMap()
println(asMap)
// Output: Map(2 -> 5, 1 -> 2, 3 -> 6)
Java例子
// Build the pair RDD in one step with parallelizePairs (no fromJavaRDD needed).
JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 4),
        new Tuple2<>(2, 5),
        new Tuple2<>(3, 1),
        new Tuple2<>(3, 9),
        new Tuple2<>(3, 6)));
// BUG FIX: collectAsMap() on a JavaPairRDD<Integer, Integer> returns
// Map<Integer, Integer>, not Map<Integer, Long> — the original declaration
// would not compile. Duplicate keys keep only one value each.
Map<Integer, Integer> collectAsMap = pairRdd.collectAsMap();