Spark supports two kinds of RDD operations: transformations and actions.
A transformation creates a new RDD from an existing one, while an action performs a final computation on an RDD, such as iterating over its elements or saving it to a file, and returns the result to the driver program.
Transformations are lazy: if a Spark program defines only transformations, running the program executes none of them.
An action triggers the execution of a Spark job, which in turn executes all the transformations that precede it.
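A minimal sketch of this laziness, in the same style as the examples below (the method name lazyDemo and the print marker inside call() are only there for illustration):
private static void lazyDemo(){
    SparkConf conf = new SparkConf().setAppName("lazyDemo").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1,2,3));
    //Defining the transformation alone prints nothing: map is lazy
    JavaRDD<Integer> doubled = numbers.map(new Function<Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer v1) throws Exception {
            System.out.println("map running on " + v1); //appears only once an action runs
            return v1 * 2;
        }
    });
    //Only this action triggers a job, which finally runs the map above
    System.out.println("count = " + doubled.count());
    sc.close();
}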
Common transformations:
map: applies a user-defined function to every element of the RDD, producing one new element per input element; together the new elements form a new RDD.
/**
 * Multiply every element of the collection by 2
 */
private static void map(){
    SparkConf conf = new SparkConf().setAppName("map").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numbers = Arrays.asList(1,2,3,4,5);
    //Parallelize the collection to create the initial RDD
    JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
    //Use the map operator to multiply every element of the collection by 2.
    //The second type parameter of Function is the type of the new elements,
    //and call() must return that same type.
    //call() performs the per-element computation and returns the new element;
    //the new elements make up the new RDD.
    JavaRDD<Integer> multipleNumberRDD = numberRDD.map(new Function<Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer v1) throws Exception {
            return v1 * 2;
        }
    });
    //Print the new RDD
    multipleNumberRDD.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });
    sc.close();
}
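With Java 8 lambdas the same operations can be written much more concisely (assuming Spark 2.x, whose Java API accepts lambdas in place of the anonymous classes used throughout this section):
JavaRDD<Integer> multipleNumberRDD = numberRDD.map(v1 -> v1 * 2);
multipleNumberRDD.foreach(t -> System.out.println(t));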
filter: evaluates a user-defined predicate on every element of the RDD; elements for which it returns true are kept, the rest are dropped.
/**
 * Keep only the even numbers in the collection
 */
private static void filter(){
    SparkConf conf = new SparkConf().setAppName("filter").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numbers = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    //Parallelize the collection to create the initial RDD
    JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
    //Use the filter operator to select the even numbers.
    //call() returns a Boolean: return true to keep the element in the new RDD, false to drop it
    JavaRDD<Integer> evenNumberRDD = numberRDD.filter(new Function<Integer, Boolean>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Boolean call(Integer v1) throws Exception {
            return v1 % 2 == 0;
        }
    });
    //Print the new RDD
    evenNumberRDD.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });
    sc.close();
}
flatMap: like map, but each input element may produce zero, one, or more new elements.
/**
 * Split lines of text into words
 */
private static void flatMap(){
    SparkConf conf = new SparkConf().setAppName("flatMap").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<String> lineList = Arrays.asList("hello you","hello me","hello world");
    JavaRDD<String> lines = sc.parallelize(lineList);
    //Split every line into words.
    //flatMap takes a FlatMapFunction; its second type parameter is the type of the new elements.
    //call() returns Iterator<U>, where U matches that second type parameter
    //(in Spark 1.x, call() returned Iterable<U> instead).
    //flatMap receives each element of the original RDD, processes it in call(),
    //and returns multiple elements wrapped in an Iterator (backed by, e.g., an ArrayList).
    //The new RDD holds all the returned elements; here it is larger than the original RDD.
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Iterator<String> call(String t) throws Exception {
            return Arrays.asList(t.split(" ")).iterator();
        }
    });
    words.foreach(new VoidFunction<String>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(String v1) throws Exception {
            System.out.println(v1);
        }
    });
    sc.close();
}
groupByKey: groups the pairs of the RDD by key; each key maps to an Iterable<value> holding all of its values.
/**
 * Group the scores by class
 */
private static void groupByKey(){
    SparkConf conf = new SparkConf().setAppName("groupByKey").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<String,Integer>> scoreList = Arrays.asList(
            new Tuple2<String, Integer>("class01",80),
            new Tuple2<String, Integer>("class02",81),
            new Tuple2<String, Integer>("class01",82),
            new Tuple2<String, Integer>("class02",83));
    //Parallelize the collection to create a JavaPairRDD
    JavaPairRDD<String,Integer> scores = sc.parallelizePairs(scoreList);
    //Use groupByKey to group the scores by class.
    //groupByKey returns a JavaPairRDD whose first type parameter is unchanged
    //and whose second becomes Iterable: the pairs are grouped by key,
    //and the possibly many values of each key are collected into one Iterable.
    JavaPairRDD<String, Iterable<Integer>> groupedScore = scores.groupByKey();
    groupedScore.foreach(new VoidFunction<Tuple2<String,Iterable<Integer>>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
            System.out.println(t._1);
            Iterator<Integer> ite = t._2.iterator();
            while(ite.hasNext()){
                System.out.println(ite.next());
            }
            System.out.println("**********************************");
        }
    });
    sc.close();
}
reduceByKey: runs a reduce over the values of each key. Unlike groupByKey, it combines values locally on each partition before the shuffle, so prefer it when the goal is per-key aggregation.
/**
 * Compute the total score of each class
 */
private static void reduceByKey(){
    SparkConf conf = new SparkConf().setAppName("reduceByKey").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<String,Integer>> scoreList = Arrays.asList(
            new Tuple2<String, Integer>("class01",80),
            new Tuple2<String, Integer>("class02",81),
            new Tuple2<String, Integer>("class01",82),
            new Tuple2<String, Integer>("class02",83));
    JavaPairRDD<String, Integer> scores = sc.parallelizePairs(scoreList);
    //reduceByKey takes a Function2, which has three type parameters:
    //the first two are the value type of the original RDD,
    //and the third is the return type of each reduce step,
    //which for reduceByKey must also match the original value type.
    JavaPairRDD<String, Integer> totalScores = scores.reduceByKey(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    totalScores.foreach(new VoidFunction<Tuple2<String,Integer>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + ":" + t._2);
        }
    });
    sc.close();
}
sortByKey: sorts the pairs of the RDD by key.
/**
 * Sort student scores
 */
private static void sortByKey(){
    SparkConf conf = new SparkConf().setAppName("sortByKey").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> scoreList = Arrays.asList(
            new Tuple2<Integer, String>(65,"leo1"),
            new Tuple2<Integer, String>(45,"leo2"),
            new Tuple2<Integer, String>(85,"leo3"),
            new Tuple2<Integer, String>(67,"leo4")
    );
    JavaPairRDD<Integer, String> scores = sc.parallelizePairs(scoreList);
    //Sort with sortByKey: no argument (or true) means ascending, false means descending
    JavaPairRDD<Integer, String> sortedScores = scores.sortByKey(false);
    sortedScores.foreach(new VoidFunction<Tuple2<Integer,String>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Tuple2<Integer, String> t) throws Exception {
            System.out.println(t._1 + ":" + t._2);
        }
    });
    sc.close();
}
join: performs an inner join by key on two RDDs of <key, value> pairs; every pair of values sharing a key becomes one element of the result.
/**
 * Print each student's score
 */
private static void join(){
    SparkConf conf = new SparkConf().setAppName("join").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> studentList = Arrays.asList(
            new Tuple2<Integer, String>(1,"leo1"),
            new Tuple2<Integer, String>(2,"leo2"),
            new Tuple2<Integer, String>(3,"leo3"),
            new Tuple2<Integer, String>(4,"leo4")
    );
    List<Tuple2<Integer, Integer>> scoreList = Arrays.asList(
            new Tuple2<Integer, Integer>(1,100),
            new Tuple2<Integer, Integer>(2,88),
            new Tuple2<Integer, Integer>(3,75),
            new Tuple2<Integer, Integer>(4,97)
    );
    //Parallelize the two collections into two RDDs
    JavaPairRDD<Integer, String> students = sc.parallelizePairs(studentList);
    JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);
    //Use the join operator to associate the two RDDs.
    //join matches elements by key and returns a JavaPairRDD whose first type parameter
    //is the key type of the two input RDDs (since the join is by key),
    //and whose second is Tuple2<v1,v2>, where v1 and v2 are the value types of the inputs.
    //(leftOuterJoin, rightOuterJoin, and fullOuterJoin are the outer-join variants.)
    JavaPairRDD<Integer, Tuple2<String, Integer>> studentScore = students.join(scores);
    studentScore.foreach(new VoidFunction<Tuple2<Integer,Tuple2<String,Integer>>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
            System.out.println("student id: " + t._1);
            System.out.println("student name: " + t._2._1);
            System.out.println("student score: " + t._2._2);
            System.out.println("*****************************");
        }
    });
    sc.close();
}
cogroup: like join, but for each key it gathers all values from each RDD into an Iterable, pairing every key with a tuple of Iterables.
/**
 * Print every student's scores
 */
private static void cogroup(){
    SparkConf conf = new SparkConf().setAppName("cogroup").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> studentList = Arrays.asList(
            new Tuple2<Integer, String>(1,"leo1"),
            new Tuple2<Integer, String>(2,"leo2"),
            new Tuple2<Integer, String>(3,"leo3"),
            new Tuple2<Integer, String>(4,"leo4")
    );
    List<Tuple2<Integer, Integer>> scoreList = Arrays.asList(
            new Tuple2<Integer, Integer>(1,100),
            new Tuple2<Integer, Integer>(2,88),
            new Tuple2<Integer, Integer>(3,75),
            new Tuple2<Integer, Integer>(4,97),
            new Tuple2<Integer, Integer>(1,80),
            new Tuple2<Integer, Integer>(2,68),
            new Tuple2<Integer, Integer>(3,95),
            new Tuple2<Integer, Integer>(4,57)
    );
    //Parallelize the two collections into two RDDs
    JavaPairRDD<Integer, String> students = sc.parallelizePairs(studentList);
    JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);
    //cogroup differs from join:
    //for each key, all matching values end up together in one Iterable per input RDD
    JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> studentScore = students.cogroup(scores);
    studentScore.foreach(new VoidFunction<Tuple2<Integer,Tuple2<Iterable<String>,Iterable<Integer>>>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> t) throws Exception {
            System.out.println("student id: " + t._1);
            System.out.println("student name: " + t._2._1);  //the Iterables are printed via their toString
            System.out.println("student score: " + t._2._2);
            System.out.println("*****************************");
        }
    });
    sc.close();
}
Common actions:
reduce: aggregates all elements of the RDD into a single value.
/**
 * Sum the numbers with reduce
 */
private static void reduce(){
    SparkConf conf = new SparkConf().setAppName("reduce").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> numbers = sc.parallelize(numberList);
    //Use reduce to sum the numbers in the collection.
    //How reduce works: the first and second elements are passed to call() to produce a result,
    //then that result and the next element are passed to call(), and so on.
    //So reduce aggregates many elements into one. Because it runs in parallel across partitions,
    //the function should be commutative and associative.
    int count = numbers.reduce(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    System.out.println(count);
    sc.close();
}
collect: fetches all elements of the RDD back to the driver.
private static void collect(){
    SparkConf conf = new SparkConf().setAppName("collect").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> numbers = sc.parallelize(numberList);
    JavaRDD<Integer> doubleNumbers = numbers.map(new Function<Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer t) throws Exception {
            return t * 2;
        }
    });
    //Instead of using the foreach action to iterate over the RDD on the remote cluster,
    //collect pulls the data of the distributed doubleNumbers RDD back to the driver.
    //This performs poorly and can exhaust driver memory on large data sets,
    //so it is generally discouraged; prefer foreach.
    List<Integer> doubleNumberList = doubleNumbers.collect();
    for(Integer num : doubleNumberList){
        System.out.println(num);
    }
    sc.close();
}
count: returns the number of elements in the RDD.
private static void count(){
    SparkConf conf = new SparkConf().setAppName("count").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> numbers = sc.parallelize(numberList);
    //Count how many elements the RDD contains
    long count = numbers.count();
    System.out.println(count);
    sc.close();
}
take(n): returns the first n elements of the RDD.
private static void take(){
    SparkConf conf = new SparkConf().setAppName("take").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> numbers = sc.parallelize(numberList);
    //take, like collect, fetches RDD data from the remote cluster,
    //but where collect fetches everything, take fetches only the first n elements
    List<Integer> top3Number = numbers.take(3);
    for(Integer num : top3Number){
        System.out.println(num);
    }
    sc.close();
}
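Two related actions are worth knowing (a brief sketch; both exist on JavaRDD and require comparable elements): takeOrdered(n) returns the n smallest elements in natural order, while top(n) returns the n largest.
List<Integer> smallest3 = numbers.takeOrdered(3); //[1, 2, 3]
List<Integer> largest3 = numbers.top(3);          //[10, 9, 8]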
saveAsTextFile: saves the elements of the RDD to a text file, calling toString on each element.
private static void saveAsTextFile(){
    //No setMaster here: this example is meant to be submitted to a cluster via spark-submit
    SparkConf conf = new SparkConf().setAppName("saveAsTextFile");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> numbers = sc.parallelize(numberList);
    JavaRDD<Integer> doubleNumbers = numbers.map(new Function<Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer t) throws Exception {
            return t * 2;
        }
    });
    //Save the RDD data directly to an HDFS file.
    //Note that the path becomes a directory of part files, despite the .txt suffix
    doubleNumbers.saveAsTextFile("/data/double_number.txt");
    sc.close();
}
countByKey: counts the number of elements for each key.
private static void countByKey(){
    SparkConf conf = new SparkConf().setAppName("countByKey").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<String, String>> studentList = Arrays.asList(
            new Tuple2<String, String>("class02","leo1"),
            new Tuple2<String, String>("class01","leo2"),
            new Tuple2<String, String>("class02","leo3"),
            new Tuple2<String, String>("class01","leo4")
    );
    //Parallelize the collection to create a JavaPairRDD
    JavaPairRDD<String, String> students = sc.parallelizePairs(studentList);
    //Count the students in each class, i.e. the number of elements per key
    Map<String, Long> studentCounts = students.countByKey();
    for(Map.Entry<String, Long> studentCount : studentCounts.entrySet()){
        System.out.println(studentCount.getKey() + ":" + studentCount.getValue());
    }
    sc.close();
}
foreach: iterates over every element of the RDD. It runs on the executors rather than pulling data back to the driver, which is why the examples above use it for printing.
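For completeness, a minimal standalone example in the same style as the ones above (a sketch; in local mode the println output appears in the console, while on a cluster it goes to the executors' stdout):
private static void foreach(){
    SparkConf conf = new SparkConf().setAppName("foreach").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1,2,3,4,5));
    //foreach is an action: it triggers a job and applies the function to every element
    numbers.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });
    sc.close();
}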