1,每个分区元素个数统计示例代码
List<Tuple2<String, Integer>> partRDD = javaSparkContext.parallelize(Arrays.asList(1, 2, 3, 1, 1))
.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>,
Iterator<Tuple2<String, Integer>>>() {
// For each partition, drain its iterator to count the elements,
// then emit a single ("part_<index>", count) pair.
@Override
public Iterator<Tuple2<String, Integer>> call(Integer partitionIndex, Iterator<Integer> elements) throws Exception {
int count = 0;
for (; elements.hasNext(); count++) {
elements.next();
}
List<Tuple2<String, Integer>> result = new ArrayList<Tuple2<String, Integer>>();
result.add(new Tuple2<String, Integer>("part_" + partitionIndex, count));
return result.iterator();
}
}, true).collect();
for (Tuple2<String, Integer> tuple2 : partRDD) {
System.out.println("分区:" + tuple2._1() + " 数量: " + tuple2._2());
}
2,每个分区元素个数统计输出结果
分区:part_0 数量: 2
分区:part_1 数量: 3
3,每个分区元素统计示例代码
List<Tuple2<String, List<Integer>>> dataList = javaSparkContext.parallelize(Arrays.asList(1, 2, 3, 1, 1))
.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>,
Iterator<Tuple2<String, List<Integer>>>>() {
// For each partition, gather every element into a list and emit a
// single ("part_<index>", elements) pair describing that partition.
@Override
public Iterator<Tuple2<String, List<Integer>>> call(Integer partitionIndex, Iterator<Integer> elements) throws Exception {
List<Integer> collected = new ArrayList<Integer>();
while (elements.hasNext()) {
collected.add(elements.next());
}
List<Tuple2<String, List<Integer>>> result = new ArrayList<Tuple2<String, List<Integer>>>();
result.add(new Tuple2<String, List<Integer>>("part_" + partitionIndex, collected));
return result.iterator();
}
}, true).collect();
for (Tuple2<String, List<Integer>> tuple2 : dataList) {
System.out.println("分区:" + tuple2._1() + " 元素: " + tuple2._2());
}
4,每个分区元素统计输出结果
分区:part_0 元素: [1, 2]
分区:part_1 元素: [3, 1, 1]