Point 1: MapPartitonsWithIndexOperator
package com.spark.operator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
// Demonstrates mapPartitionsWithIndex: tags each element with the index of the
// partition it was assigned to, making Spark's data distribution visible.
public class MapPartitonsWithIndexOperator {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitonsWithIndexOperator")
                .setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Sample data, explicitly split across 2 partitions.
            // (With master local[2] the default parallelism would be 2 anyway.)
            List<String> names = Arrays.asList("xurunyun", "liangyongqi", "wangfei");
            JavaRDD<String> nameRDD = sc.parallelize(names, 2);

            // Spark decides which element lands in which partition; the
            // mapPartitionsWithIndex operator hands each partition's index to
            // the function so we can observe that assignment.
            JavaRDD<String> nameWithPartitionIndex = nameRDD.mapPartitionsWithIndex(
                    new Function2<Integer, Iterator<String>, Iterator<String>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Iterator<String> call(Integer index, Iterator<String> iterator)
                                throws Exception {
                            // Prefix every element of this partition with its
                            // partition index, e.g. "0 : xurunyun".
                            List<String> list = new ArrayList<String>();
                            while (iterator.hasNext()) {
                                list.add(index + " : " + iterator.next());
                            }
                            return list.iterator();
                        }
                    }, true);

            nameWithPartitionIndex.foreach(new VoidFunction<String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(String result) throws Exception {
                    System.out.println(result);
                }
            });
        } finally {
            // Release the SparkContext even if a job above throws;
            // previously close() was skipped on failure, leaking the context.
            sc.close();
        }
    }
}
Point 2: MapPartitionsOperator
package com.spark.operator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
// 理解里面final使用的原因!
// Demonstrates mapPartitions: the function is invoked once per partition with
// an iterator over ALL of that partition's elements (map is once per element).
// The scoreMap is declared final so the anonymous inner class may capture it.
public class MapPartitionsOperator {
    public static void main(String[] args) {
        // Fix copy-paste bug: app name previously said "JoinOperator".
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitionsOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Sample data.
            List<String> names = Arrays.asList("xurunyun", "liangyongqi", "wangfei");
            JavaRDD<String> nameRDD = sc.parallelize(names);

            final Map<String, Integer> scoreMap = new HashMap<String, Integer>();
            scoreMap.put("xurunyun", 150);
            scoreMap.put("liangyongqi", 100);
            scoreMap.put("wangfei", 90);

            // Guidance: mapPartitions materializes one whole partition in
            // memory at a time. For moderately sized partitions it is faster
            // than map (per-partition setup cost paid once); for huge
            // partitions (e.g. billions of rows each) it risks OOM — prefer map.
            JavaRDD<Integer> scoreRDD = nameRDD.mapPartitions(
                    new FlatMapFunction<Iterator<String>, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Iterable<Integer> call(Iterator<String> iterator)
                                throws Exception {
                            List<Integer> list = new ArrayList<Integer>();
                            while (iterator.hasNext()) {
                                // NOTE: get() yields null for names absent from
                                // scoreMap; all three sample names are present.
                                list.add(scoreMap.get(iterator.next()));
                            }
                            return list;
                        }
                    });

            scoreRDD.foreach(new VoidFunction<Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(Integer score) throws Exception {
                    System.out.println(score);
                }
            });
        } finally {
            // Release the SparkContext even if a job above throws.
            sc.close();
        }
    }
}
Point 3: MapOperator
package com.spark.operator;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
// Demonstrates map: applies a function to every element of the RDD
// independently, producing a new RDD of the transformed values.
public class MapOperator {
    public static void main(String[] args) {
        // Fix copy-paste bug: app name previously said "LineCount".
        SparkConf conf = new SparkConf()
                .setAppName("MapOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);
            JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

            // map transforms each element on its own: here, multiply by 10.
            JavaRDD<Integer> results = numberRDD.map(new Function<Integer, Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Integer call(Integer number) throws Exception {
                    return number * 10;
                }
            });

            results.foreach(new VoidFunction<Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(Integer result) throws Exception {
                    System.out.println(result);
                }
            });
        } finally {
            // Release the SparkContext even if a job above throws.
            sc.close();
        }
    }
}