Spark: Actions

1. Operators

reduce: aggregates all elements of the RDD with a binary function. The first and second elements are combined, that result is combined with the third element, that result with the fourth, and so on. The function should be commutative and associative.
collect: fetches all elements of the RDD back to the local driver; use with care on large RDDs, since the whole result must fit in driver memory.
count: returns the total number of elements in the RDD.
take(n): returns the first n elements of the RDD.
saveAsTextFile: saves the RDD's elements to a file, calling toString on each element.
countByKey: counts the number of elements for each key.
foreach: applies a function to each element of the RDD; a minimal sketch follows this list.
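
None of the examples below cover foreach, so here is a minimal Scala sketch of it (my addition, not from the original post; the local master is chosen for illustration, and note that in cluster mode the println side effect runs on the executors, so output lands in executor logs rather than on the driver):

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object ForeachSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("foreach")
    val sc = new SparkContext(conf)

    // foreach is an action run for its side effect; with a local
    // master the println output appears in this console
    sc.parallelize(Array(1, 2, 3, 4, 5)).foreach(num => println(num))
    sc.stop()
  }
}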

2. Examples
① reduce

Example 1: Sum (Java)

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        reduce();
    }
    
    private static void reduce(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("reduce");
        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Parallelize the numbers 1-10 into an RDD
        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numRDD = sc.parallelize(numList);

        // reduce combines elements pairwise until one value remains
        Integer sum = numRDD.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        System.out.println(sum);
        sc.close();
    }
}

Example 2: Sum (Scala)

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myReduce()
  }
  def myReduce(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("reduce")
    val sc = new SparkContext(conf)

    val number = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(number)
    // reduce combines the elements pairwise: ((1+2)+3)+... = 55
    val sum: Int = numbers.reduce(_ + _)
    println(sum)
    sc.stop()
  }
}


② collect

Example 1: Multiply each element by 2 (Java)

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        collect();
    }  
    private static void collect(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("collect");
        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numRDD = sc.parallelize(numList);

        // Double each element with a map transformation
        JavaRDD<Integer> numc2 = numRDD.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 2;
            }
        });

        // collect pulls the entire RDD back to the driver as a local List
        List<Integer> collect = numc2.collect();
        for (Integer a : collect){
            System.out.println(a);
        }
        sc.close();
    }
}  

Example 2: Multiply each element by 2 (Scala)

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCollect()
  }
  def myCollect(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("collect")
    val sc = new SparkContext(conf)

    val number = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(number)
    val doubleNum = numbers.map(_ * 2)
    val ints: Array[Int] = doubleNum.collect()
    for (num <- ints) {
      println(num)
    }
    sc.stop()
  }
}


③ count

Example 1: Get the total number of elements in the RDD (Java)

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        count();
    }
    private static void count(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("count");

        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Parallelize the numbers 1-10 into an RDD
        List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);
        // count returns the total number of elements in the RDD
        long count = numbers.count();
        System.out.println(count);
        sc.close();
    }
}

Example 2: Get the total number of elements in the RDD (Scala)

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCount()
  }
  def myCount(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("count")
    val sc = new SparkContext(conf)

    val number = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(number)
    val count = numbers.count()
    println(count)
    sc.stop()
  }
}


④ take(n)

Example 1: Get the first three elements (Java)

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        take();
    }
    private static void take(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("take");

        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Parallelize the numbers 1-10 into an RDD
        List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);

        JavaRDD<Integer> numbers = sc.parallelize(numberList);

        // Like collect, take fetches RDD data from the remote cluster,
        // but only the first n elements
        List<Integer> take = numbers.take(3);
        for (Integer a : take) {
            System.out.println(a);
        }
        sc.close();
    }
}

Example 2: Get the first three elements (Scala)

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myTake()
  }
  def myTake(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("take")
    val sc = new SparkContext(conf)

    val number = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(number)
    val take: Array[Int] = numbers.take(3)
    for (num <- take) {
      println(num)
    }
    sc.stop()
  }
}


⑤ saveAsTextFile

Example (Java): save the RDD's elements to a file, calling toString on each element

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;
public class Action_8 {
    public static void main(String[] args) {
        saveAsTextFile();
    }
    private static void saveAsTextFile(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("saveAsTextFile");

        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Parallelize the numbers 1-10 into an RDD
        List<Integer> numberList = Arrays.asList(1,2,3,4,5,6,7,8,9,10);

        JavaRDD<Integer> numbers = sc.parallelize(numberList);

        // Triple each element before saving
        JavaRDD<Integer> numc3 = numbers.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 3;
            }
        });

        // Note: saveAsTextFile creates a directory at this path containing
        // part-NNNNN files (one per partition), not a single text file
        numc3.saveAsTextFile("hdfs://hadoop1:9000/numc3.txt");

        sc.close();
    }
}
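
The post shows only the Java version for saveAsTextFile. For symmetry with the other operators, here is a minimal Scala sketch; the hdfs://hadoop1:9000 address and the output path are assumptions carried over from the Java example:

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object SaveAsTextFileSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("saveAsTextFile")
    val sc = new SparkContext(conf)

    val numbers = sc.parallelize(Array(1,2,3,4,5,6,7,8,9,10))
    // Like the Java version, this writes a directory of part files;
    // the HDFS address is assumed from the example above
    numbers.map(_ * 3).saveAsTextFile("hdfs://hadoop1:9000/numc3_scala.txt")
    sc.stop()
  }
}
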
⑥ countByKey

Example 1: Count the students in each class (Java)

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class Action_8 {
    public static void main(String[] args) {
        countByKey();
    }
    private static void countByKey(){
        // Create SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("countByKey");

        // Create SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Mock up a collection of (class, student) pairs
        List<Tuple2<String,String>> studentList = Arrays.asList(
                new Tuple2<String,String>("class1","leo"),
                new Tuple2<String,String>("class2","jack"),
                new Tuple2<String,String>("class1","marry"),
                new Tuple2<String,String>("class2","tom"),
                new Tuple2<String,String>("class2","da")
        );
        // Parallelize the collection into a JavaPairRDD
        JavaPairRDD<String, String> student = sc.parallelizePairs(studentList);

        // Apply countByKey to count the students in each class, i.e. the
        // number of elements for each key; the result comes back to the
        // driver as a local Map
        Map<String, Object> stringObjectMap = student.countByKey();

        for (Map.Entry<String,Object> a : stringObjectMap.entrySet()){
            System.out.println(a.getKey()+":"+a.getValue());
        }

        sc.close();
    }
}

Example 2: Count the students in each class (Scala)

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCountByKey()
  }
  def myCountByKey(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("countByKey")
    val sc = new SparkContext(conf)

    val studentList = Array(
      Tuple2("class1","leo"),
      Tuple2("class2","jack"),
      Tuple2("class1","tom"),
      Tuple2("class2","jen"),
      Tuple2("class2","marry"))

    val scoreRDD = sc.parallelize(studentList)
    val stringToLong: collection.Map[String, Long] = scoreRDD.countByKey()

    for ((key, value) <- stringToLong) {
      println(key + ":" + value)
    }
    sc.stop()
  }
}
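
One caveat worth adding (my note, not from the original post): countByKey returns its result as a local Map on the driver, so it only suits a modest number of distinct keys. A hypothetical continuation of myCountByKey that keeps the aggregation distributed with reduceByKey instead:

    // The per-key counting stays distributed across the cluster; only the
    // final (small) result is collected for printing. Reuses scoreRDD
    // from the example above.
    val countsRDD = scoreRDD.mapValues(_ => 1L).reduceByKey(_ + _)
    countsRDD.collect().foreach { case (key, value) => println(key + ":" + value) }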

