Spark Core基础RDD操作【map,flatMap,mapPartitions,flatMapToPair】
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.1</version>
</dependency>
</dependencies>
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class SparkCoreDemo {
private static String appName = "spark.demo";
private static String master = "local[*]";
public static void main(String args[]){
//初始化 JavaSparkContext
SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
JavaSparkContext sc = new JavaSparkContext(conf);
// 构造数据源
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
//并行化创建rdd
JavaRDD<Integer> rdd = sc.parallelize(data,3);
//map
List<Integer> list_rdd = rdd.map(new Function<Integer,Integer>() {
public Integer call(Integer i) throws Exception {
i = i + 1;
return i;
}
}).collect();
//map
for(Integer i : list_rdd){
System.out.println(i);
}
//flatMap
JavaRDD<Integer> rdd0 = rdd.flatMap(new FlatMapFunction<Integer, Integer>(){
public Iterator<Integer> call(Integer integer) throws Exception {
List l = new ArrayList();
l.add(integer);
return l.iterator();
}
});
System.out.println("flatMap: "+rdd0.collect());
//mapPartitions
JavaRDD<Integer[]> rdd1 = rdd.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer[]>(){
public Iterator<Integer[]> call(Iterator<Integer> integerIterator) throws Exception {
List l = new ArrayList();
Integer temp = 0;
while(integerIterator.hasNext()){
Integer i = integerIterator.next();
temp = temp + i;
}
l.add(temp);
return l.iterator();
}
});
System.out.println("mapPartitions: "+ rdd1.collect());
//flatMapToPair
JavaPairRDD<Integer,Integer> rdd2 = rdd.flatMapToPair(new PairFlatMapFunction<Integer,Integer,Integer>(){
public Iterator<Tuple2<Integer, Integer>> call(Integer integer) throws Exception {
List<Tuple2<Integer, Integer>> resultTuple = new ArrayList<Tuple2<Integer, Integer>>();
System.out.println(integer);
resultTuple.add(new Tuple2<Integer, Integer>(integer,integer));
return resultTuple.iterator();
}
});
System.out.println("mapPartitionsToPair: "+ rdd2.collect());
}
}
Result (the per-element values echoed by the println inside the flatMapToPair function are not shown):
2
3
4
5
6
flatMap: [1, 2, 3, 4, 5]
mapPartitions: [1, 5, 9]
mapPartitionsToPair: [(1,1), (2,2), (3,3), (4,4), (5,5)]