Action算子
常用的action操作算子主要如下:
1、reduce
2、collect
3、count
4、take(n)
5、saveAsTextFile
6、countByKey
7、foreach
- java版本
public class ActionOperation {
public static void main(String[] args) {
//reduce();
//collect();
//count();
//take();
//saveAsTextFile();
countByKey();
}
private static void reduce(){
//创建SparkConf
SparkConf conf = new SparkConf().setAppName("reduce").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
//对一个集合中的1-10 10个数字进行累加
List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
JavaRDD<Integer> numbers = sc.parallelize(numberList);
//使用reduce操作对集合中的数字进行累加
//reduce操作原理:
//首先:将第一个和第二个元素,传入call()方法,进行计算,会获取一个结果,如1+2=3
//然后,将结果与下一个元素传入call()方法,进行计算,如3+3=6
//以此类推
//所以reduce操作的本质就是聚合,将多个元素聚合成一个元素
int sum = numbers.reduce(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
System.out.println(sum);
//关闭JavaSparkContext
sc.close();
}
/**
*
*/
public static void collect() {
//创建SparkConf
SparkConf conf = new SparkConf().setAppName("collect").setMaster("local");
//创建JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
//构造集合
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
//并行化集合,创建初始RDD
JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
JavaRDD<Integer> doubleNumbers = numberRDD.map(new Function<Integer, Integer>() {
//传入call()方法的就是1,2,3,4,5
//返回的就是2,4,6,8,10
public Integer call(Integer v1) throws Exception {
return v1 * 2;
}
});
//不用foreach action操作,在远程集群上遍历rdd中的元素
//而是用collect操作,将分布在远程集群上的doubleNumber RDD的数据拉取到本地
//这种方式一般不建议使用
//因为如果RDD中的数据量比较大,比如超过1万条,那么性能会比较差
//因为要从远程走大量的网络传输,将数据获取到本地
//此外,除了性能差,还可能在RDD中数据量特别大的情况下,发生com异常,内存溢出
//因此,通常还是推荐是用foreach action操作,来对最终的RDD进行处理
List<Integer> doubleNumberList = doubleNumbers.collect();
for (Integer num:doubleNumberList) {
System.out.println(num);
}
//关闭JavaSparkContext
sc.close();
}
public static void count() {
//创建SparkConf
SparkConf conf = new SparkConf().setAppName("count").setMaster("local");
//创建JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
//构造集合
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
//并行化集合,创建初始RDD
JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
//对RDD使用count操作,统计它有多少个元素
long count = numberRDD.count();
System.out.println(count);
//关闭JavaSparkContext
sc.close();
}
private static void take(){
//创建SparkConf
SparkConf conf = new SparkConf().setAppName("take").setMaster("local");
//创建JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
//构造集合
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
//并行化集合,创建初始RDD
JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
//take操作与collect类似,也是从远程集群上,获取RDD的数据
//但是collect是获取RDD的所有数据,take只是获取前n个数据
List<Integer> top3Numbers = numberRDD.take(3);
for (Integer num :top3Numbers) {
System.out.println(num);
}
//关闭JavaSparkContext
sc.close();
}
private static void saveAsTextFile(){
//创建SparkConf
SparkConf conf = new SparkConf().setAppName("saveAsTextFile").setMaster("local");
//创建JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
//构造集合
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
//并行化集合,创建初始RDD
JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
JavaRDD<Integer> doubleNumbers = numberRDD.map(new Function<Integer, Integer>() {
//传入call()方法的就是1,2,3,4,5
//返回的就是2,4,6,8,10
public Integer call(Integer v1) throws Exception {
return v1 * 2;
}
});
//注意:这里只能指定文件夹,也就是目录
doubleNumbers.saveAsTextFile("./saveAsTextFile");
sc.close();
}
private static void countByKey(){
//创建sparkConf
SparkConf conf = new SparkConf().setAppName("groupByKey").setMaster("local");
//创建JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
//模拟集合
List<Tuple2<String, String>> studentsList = Arrays.asList(
new Tuple2<String, String>("class1", "a"),
new Tuple2<String, String>("class2", "b"),
new Tuple2<String, String>("class1", "c"),
new Tuple2<String, String>("class2", "d")
);
//并行化集合
JavaPairRDD<String, String> students = sc.parallelizePairs(studentsList);
//对RDD应用countByKey操作,统计每个班级的学生人数,也就是统计每个key对应的元素个数
//也就是countByKey的应用
Map<String,Long> studentCounts = students.countByKey();
for(Map.Entry<String,Long> studentCount : studentCounts.entrySet()){
System.out.println(studentCount.getKey() + " : " + studentCount.getValue());
}
//关闭JavaSparkContext
sc.close();
}
}
- scala版本
object ActionOperation_scala {
// Entry point: uncomment exactly one of the demo calls below to run it.
def main(args: Array[String]): Unit = {
//reduce()
//collect()
//count()
//take()
countByKey()
}
/** reduce action: folds the elements 1..10 into their sum and prints it. */
def reduce(): Unit = {
  val conf = new SparkConf().setAppName("reduce").setMaster("local")
  val sc = new SparkContext(conf)
  val numberList = Array(1,2,3,4,5,6,7,8,9,10)
  val numberRDD = sc.parallelize(numberList)
  // Pairwise aggregation: (1+2)=3, (3+3)=6, ... down to a single value
  val sum = numberRDD.reduce(_ + _)
  println(sum)
  // Fixed: the SparkContext was never stopped, leaking its resources
  sc.stop()
}
/**
 * collect action: doubles each element and pulls the whole RDD back to the
 * driver as a local array, then prints it. For large RDDs prefer foreach —
 * collect can be slow (network transfer) or OOM the driver.
 */
def collect(): Unit = {
  val conf = new SparkConf().setAppName("collect").setMaster("local")
  val sc = new SparkContext(conf)
  val numberList = Array(1,2,3,4,5,6,7,8,9,10)
  val numbers = sc.parallelize(numberList)
  val doubleNumbers = numbers.map { num => num * 2 }
  val doubleNumbersArray = doubleNumbers.collect()
  for (num <- doubleNumbersArray) {
    println(num)
  }
  // Fixed: the SparkContext was never stopped, leaking its resources
  sc.stop()
}
/** count action: prints the number of elements in the RDD. */
def count(): Unit = {
  val conf = new SparkConf().setAppName("count").setMaster("local")
  val sc = new SparkContext(conf)
  val numberList = Array(1,2,3,4,5,6,7,8,9,10)
  val numbers = sc.parallelize(numberList)
  val count = numbers.count()
  println(count)
  // Fixed: the SparkContext was never stopped, leaking its resources
  sc.stop()
}
/**
 * take(n) action: like collect it fetches RDD data to the driver,
 * but only the first n elements (here the first 3), then prints them.
 */
def take(): Unit = {
  val conf = new SparkConf().setAppName("take").setMaster("local")
  val sc = new SparkContext(conf)
  val numberList = Array(1,2,3,4,5,6,7,8,9,10)
  val numbers = sc.parallelize(numberList)
  val top3Numbers = numbers.take(3)
  for (num <- top3Numbers) {
    println(num)
  }
  // Fixed: the SparkContext was never stopped, leaking its resources
  sc.stop()
}
/**
 * countByKey action: counts the elements per key of a pair RDD
 * (here, scores per student id) and prints the resulting Map.
 */
def countByKey(): Unit = {
  // Fixed: appName used to say "take", a copy-paste leftover
  val conf = new SparkConf().setAppName("countByKey").setMaster("local")
  val sc = new SparkContext(conf)
  val scoreList = Array(Tuple2(1, "90"), Tuple2(2, "100"), Tuple2(2, "89"), Tuple2(1, "60"))
  val scores = sc.parallelize(scoreList, 1)
  val students = scores.countByKey()
  println(students)
  // Fixed: the SparkContext was never stopped, leaking its resources
  sc.stop()
}