Basic Usage of Spark RDD Action Operators (Java)
I have recently been summarizing the Spark RDD operators, and have put together these basic usage examples of the action operators for reference, to help you get started quickly.
package com.edward.spark.core;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;
import java.util.*;
import scala.Tuple2;
import java.io.Serializable;
public class ApiTestAction {
private static SparkConf conf = new SparkConf()
.setMaster("local[1]")
.setAppName("ApiTest");
private static JavaSparkContext jsc = new JavaSparkContext(conf);
/**
* count() returns the number of elements; the return type is long.
*/
private static void api_count()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
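// rdd2 has 4 elements, so count() returns 4.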
long value =rdd2.count();
System.out.println(value);
}
/**
* countByValue() returns the count of each distinct value; the return type is Map<value, count>.
*/
private static void api_countByValue()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,1);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
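// All 7 values in rdd are distinct, so every count below is 1.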
Map<Integer, Long> map = rdd.countByValue();
map.forEach((k,v)->{
System.out.println("key:" + k + " value:" + v);
});
}
/**
* countByKey() returns the count of each key; the return type is Map<key, count>. Only available on JavaPairRDD.
*/
private static void api_countByKey()
{
List<Tuple2<Integer,String>> data3 =Arrays.asList(new Tuple2<>(1,"Edward"), new Tuple2<>(1,"CiCi"),new Tuple2<>(3,"Della"));
JavaPairRDD<Integer,String> rdd3=jsc.parallelizePairs(data3,3);
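// Key 1 appears twice and key 3 once, so countByKey returns {1=2, 3=1};
// countByValue counts whole (key, value) tuples, so each tuple counts once.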
Map<Integer, Long> map = rdd3.countByKey();
Map<Tuple2<Integer, String>, Long> map2 = rdd3.countByValue();
map.forEach((k,v)->{
System.out.println("key:" + k + " count:" + v);
});
map2.forEach((k,v)->{
System.out.println("value:" + k + " count:" + v);
});
}
/**
* max() returns the largest element, ordered by the given comparator.
*/
private static void api_max()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
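// Natural ordering: the largest integer is 23, the largest string is "Mystique".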
String value2 =rdd2.max(Comparator.naturalOrder());
int value =rdd.max(Comparator.naturalOrder());
System.out.println("value=" + value);
System.out.println("value2=" + value2);
}
/**
* min() returns the smallest element, ordered by the given comparator.
*/
private static void api_min()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
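// Natural ordering: the smallest integer is 1, the smallest string is "CiCi".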
String value2 = rdd2.min(Comparator.naturalOrder());
int value =rdd.min(Comparator.naturalOrder());
System.out.println("value=" + value);
System.out.println("value2=" + value2);
}
/**
* first() returns the first element.
*/
private static void api_first()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
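// first() takes the first element of the first partition: "Edward".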
String firstValue =rdd2.first();
System.out.println(firstValue);
}
/**
* collect() returns all elements of the dataset as a List<T>.
* Use it only for testing or for small datasets, because the results are all held in driver memory.
*/
private static void api_collect()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
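// Pulls all 7 integers into the driver: [1, 2, 3, 4, 5, 11, 23].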
List<Integer> output =rdd.collect();
System.out.println(output);
}
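/**
* collectPartitions(int[]) returns the elements of the chosen partitions as an array of lists.
* action_type2() below calls this method; here is a minimal sketch, assuming partitions 0 and 1
* are the ones of interest.
*/
private static void api_collectPartitions()
{
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
JavaRDD<Integer> rdd = jsc.parallelize(data, 3);
// Collect only partitions 0 and 1; each entry of the array holds one partition's elements.
List<Integer>[] parts = rdd.collectPartitions(new int[]{0, 1});
for (List<Integer> part : parts) {
System.out.println(part);
}
}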
/**
* lookup(key) returns the values for the given key as a List<V>. Only available on JavaPairRDD.
*/
private static void api_lookup()
{
List<Tuple2<Integer,String>> data3 =Arrays.asList(new Tuple2<>(1,"Edward"), new Tuple2<>(1,"CiCi"),new Tuple2<>(3,"Della"));
JavaPairRDD<Integer,String> rdd3=jsc.parallelizePairs(data3,3);
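// Key 1 has two values, so this prints [Edward, CiCi].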
List<String> out = rdd3.lookup(1);
System.out.println(out);
}
/**
* take(num) returns the first num elements as a List<T>.
* It scans one partition at a time, so it can be slow when several partitions are involved.
* Suitable for small result sizes, because the results are held in driver memory.
*/
private static void api_take()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
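// Takes the first 3 elements in partition order: [Edward, CiCi, Della].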
List<String> list =rdd2.take(3);
System.out.println(list);
}
private static class TestComparator implements Serializable,Comparator<Tuple2<String, Integer>>{
@Override
public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
return o1._2.compareTo(o2._2);
}
}
private static class TestComparator2 implements Serializable,Comparator<Integer>{
@Override
public int compare(Integer o1, Integer o2) {
return o1.compareTo(o2);
}
}
private static class TestComparator3 implements Serializable, Comparator<String> {
@Override
public int compare(String o1, String o2) {
return o1.compareTo(o2);
}
}
/**
* takeOrdered(num[, comparator]) returns the smallest num elements according to the given comparator, or natural ordering if none is given.
* Suitable for small result sizes, because the results are held in driver memory.
*/
private static void api_takeOrdered()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
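// Smallest 2 integers under natural ordering: [1, 2].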
List<Integer> output =rdd.takeOrdered(2);
System.out.println(output);
System.out.println(rdd2.takeOrdered(2, new TestComparator3()));
System.out.println(rdd.takeOrdered(3,new TestComparator2()));
}
/**
* top(num[, comparator]) returns the largest num elements according to the given comparator, or natural ordering if none is given.
* Suitable for small result sizes, because the results are held in driver memory.
*/
private static void api_top()
{
List<Tuple2<String, Integer>> pairs = Arrays.asList(new Tuple2<>("A", 1),
new Tuple2<>("A", 2), new Tuple2<>("B", 1));
JavaPairRDD<String, Integer> rdd = jsc.parallelizePairs(pairs, 3);
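// ("A", 2) has the largest value; one of the value-1 pairs fills the second slot.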
List<Tuple2<String, Integer>> result = rdd.top(2, new TestComparator());
System.out.println(result);
}
/**
* takeSample returns a fixed-size random sample. It takes three parameters:
* withReplacement: whether to sample with replacement; true means a drawn element is put back (duplicates possible), false means it is not (no duplicates).
* num: the number of elements to return.
* seed: the seed for the random number generator. Generated "random" numbers are pseudo-random: they come from a deterministic function applied to a seed (commonly the clock), which in Java is itself derived from an initial value (8682522807148012L by default) combined with a magic constant. This parameter is usually omitted.
*/
private static void api_takeSample()
{
List<Integer> data =Arrays.asList(1,2,3,4,5,11,23);
List<String> data2 = Arrays.asList("Edward","CiCi","Della","Mystique");
JavaRDD<Integer> rdd =jsc.parallelize(data,3);
JavaRDD<String> rdd2 =jsc.parallelize(data2,2);
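// withReplacement=false, so the 3 sampled elements are distinct; no seed is passed, so results vary per run.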
List<Integer> out=rdd.takeSample(false,3);
System.out.println(out);
}
/**
* collectAsMap() returns the key-value pairs as a Map. If a key appears more than once, only one of its values is kept.
*/
private static void api_collectAsMap()
{
List<Tuple2<Integer,String>> data =Arrays.asList(new Tuple2<>(1,"Edward"), new Tuple2<>(1,"CiCi"),new Tuple2<>(3,"Della"));
JavaPairRDD<Integer,String> rdd =jsc.parallelizePairs(data,2);
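// Key 1 maps to both "Edward" and "CiCi"; collectAsMap keeps only one of them.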
Map<Integer,String> map=rdd.collectAsMap();
System.out.println(map);
}
/**
* aggregate takes three parameters. zeroValue: the initial value; seqOp: folds the elements within each partition; combOp: merges the per-partition results.
* For (1, 1, 2, 3) in 2 partitions: partition 0 folds (0,0) -> (0+1,0+1) = (1,1) -> (1+1,1+1) = (2,2);
* partition 1 folds (0,0) -> (0+2,0+1) = (2,1) -> (2+3,1+1) = (5,2); combOp then merges (2,2) and (5,2) into (7,4), i.e. (sum, count).
* The return type may differ from the element type.
*/
private static void api_aggregate()
{
List<Integer> data = Arrays.asList(1, 1, 2, 3);
JavaRDD<Integer> rdd = jsc.parallelize(data, 2);
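// parallelize slices the list positionally: partition 0 holds [1, 1], partition 1 holds [2, 3].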
Tuple2<Integer, Integer> result = rdd.aggregate(new Tuple2<>(0,0),
(x,y)->new Tuple2<>(x._1+y, x._2+1),
(x,y)->new Tuple2<>(x._1+y._1, x._2+y._2));
System.out.println(result);
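// The same pattern with a plain int zero value simply sums the elements: prints 7.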
int result2 =rdd.aggregate(0, (x,y)-> (x+y), (x,y)-> (x+y));
System.out.println(result2);
}
/**
* fold aggregates the values within each partition and then merges the partition results.
* The difference from aggregate is the return type: fold requires the result type to match the element type.
*/
private static void api_fold()
{
List<Integer> data = Arrays.asList(1, 1, 2, 3);
JavaRDD<Integer> rdd = jsc.parallelize(data, 2);
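// The zero value is applied once per partition and once in the final merge; with 0 the result is just the sum, 7.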
int result =rdd.fold(0,(x,y)->(x+y));
System.out.println(result);
}
/**
* reduce aggregates the elements with the given function.
* The difference from fold is that there is no initial value; the return type must still match the element type.
*/
private static void api_reduce()
{
List<Integer> data = Arrays.asList(1, 1, 2, 3);
JavaRDD<Integer> rdd = jsc.parallelize(data, 2);
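// Sums all elements: prints 7.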
int result =rdd.reduce((x,y)->(x+y));
System.out.println(result);
}
/**
* foreach performs the given operation on each element.
*/
private static void api_foreach()
{
List<Integer> data = Arrays.asList(1, 1, 2, 3);
JavaRDD<Integer> rdd = jsc.parallelize(data, 2);
rdd.foreach(x->System.out.println(x));
}
/**
* foreachPartition performs the given operation on the iterator of each partition.
* The difference from foreach: if the function needs expensive resources such as database or TCP connections,
* foreachPartition can set them up once per partition rather than once per element, saving overhead.
*/
private static void api_foreachPartition()
{
List<Integer> data = Arrays.asList(1, 1, 2, 3);
JavaRDD<Integer> rdd = jsc.parallelize(data, 2);
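// Typical use: open one expensive resource (e.g. a DB connection) per partition here,
// reuse it for every element of the iterator, then close it.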
rdd.foreachPartition(x->x.forEachRemaining(y->System.out.println(y)));
}
private static void action_type1()
{
api_count();
api_countByValue();
api_countByKey();
api_max();
api_min();
}
private static void action_type2()
{
api_first();
api_collect();
api_collectPartitions();
api_lookup();
api_take();
api_takeOrdered();
api_top();
api_takeSample();
api_collectAsMap();
}
private static void action_type3()
{
api_aggregate();
api_fold();
api_reduce();
}
private static void action_type4()
{
api_foreach();
api_foreachPartition();
}
public static void main(String[] args) {
action_type1();
action_type2();
action_type3();
action_type4();
jsc.stop();
}
}