RDD Operation Examples

This article walks through the common transformations map, flatMap, filter, groupByKey, reduceByKey, join, and cogroup.
join connects the contents of two key-value datasets by matching them on the key.
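As a quick orientation before the full programs below, here is a minimal sketch (assuming a local master; the object name GroupVsReduce is only for illustration) of how groupByKey and reduceByKey differ: groupByKey collects all the values of a key into an Iterable, while reduceByKey merges them with the supplied function.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object GroupVsReduce {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("GroupVsReduce").setMaster("local"))
    val pairs = sc.parallelize(Array(("Spark", 1), ("Hadoop", 1), ("Spark", 1)))

    pairs.groupByKey().collect().foreach(println)        //e.g. (Spark,CompactBuffer(1, 1)) and (Hadoop,CompactBuffer(1))
    pairs.reduceByKey(_ + _).collect().foreach(println)  //e.g. (Spark,2) and (Hadoop,1)

    sc.stop()
  }
}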

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object Transformations {
  def main(args:Array[String]){

    val conf = new SparkConf().setAppName("Transformations").setMaster("local") //create a SparkConf to initialize the application's configuration
    val sc = new SparkContext(conf)  //create the SparkContext, the sole entry point for creating RDDs and the only channel to the cluster

    val nums = sc.parallelize(1 to 10)  //create an RDD from a local collection
    val mapped = nums.map(_*2)  //map works on elements of any type: it traverses every element of the RDD and applies the given function to each one
    mapped.collect().foreach(println)  

    val filtered = nums.filter(_%2 == 0)
    filtered.collect().foreach(println)

    val bigData = Array("Scala Spark", "Java Hadoop", "Java Tachyon")
    val bigDataString = sc.parallelize(bigData)
    val words = bigDataString.flatMap(_.split(" "))
    words.collect().foreach(println)

    sc.stop()
  }
}
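For reference, running this program locally should print roughly the following values (Spark's own log output omitted; each value is actually printed on its own line):

/*
mapped:   2 4 6 8 10 12 14 16 18 20
filtered: 2 4 6 8 10
words:    Scala Spark Java Hadoop Java Tachyon
*/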

Each piece of functionality invoked from the main method should be modularized, and each module can be encapsulated in its own function.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object Transformations {
  def main(args:Array[String]){
    val sc = sparkContext("Transformation Operations") //create the SparkContext
    mapTransformation(sc)
    filterTransformation(sc)
    flatMapTransformation(sc)
    groupByKeyTransformation(sc)
    reduceByKeyTransformation(sc)
    joinTransformation(sc)
    sc.stop()
  }

  def sparkContext(name: String) = {
    val conf = new SparkConf().setAppName(name).setMaster("local") //create a SparkConf to initialize the application's configuration
    val sc = new SparkContext(conf)  //create the SparkContext, the sole entry point for creating RDDs and the only channel to the cluster
    sc
  }

  def mapTransformation(sc: SparkContext){
    val nums = sc.parallelize(1 to 10)  //create an RDD from a local collection
    val mapped = nums.map(_*2)

    mapped.collect().foreach(println)
  }

  def filterTransformation(sc: SparkContext){
    val nums = sc.parallelize(1 to 20)  //create an RDD from a local collection
    val filtered = nums.filter(_%2 == 0)  //filter keeps the elements for which the given predicate returns true and builds a new MapPartitionsRDD from them
    filtered.collect().foreach(println)
  }

  def flatMapTransformation(sc: SparkContext){
    val bigData = Array("Scala Spark", "Java Hadoop", "Java Tachyon") //instantiate an Array of strings
    val bigDataString = sc.parallelize(bigData)  //create a ParallelCollectionRDD whose elements are strings
    val words = bigDataString.flatMap(_.split(" ")) //first apply the given function to each string to split it into words (each yielding a collection), then flatten the results into one combined collection
    words.collect().foreach(println)
  }

  def groupByKeyTransformation(sc: SparkContext){
    val data = Array(Tuple2(100, "Spark"), Tuple2(100, "Tachyon"), Tuple2(70, "Hadoop"), Tuple2(80, "HBase"))
    val dataRDD = sc.parallelize(data) //create the RDD
    val grouped = dataRDD.groupByKey() //group the values that share the same key; the grouped values form an Iterable
    grouped.collect().foreach(println)
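    //expected output (key order may vary), e.g.:
    //(100,CompactBuffer(Spark, Tachyon))
    //(80,CompactBuffer(HBase))
    //(70,CompactBuffer(Hadoop))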
  }
  def reduceByKeyTransformation(sc: SparkContext){
    val lines = sc.textFile("words.txt")  //words.txt must exist in the working directory
    val wordCount = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_+_)  //merge the values of each key with the given function
    wordCount.collect().foreach(println)
  }
  def joinTransformation(sc: SparkContext){
    val studentNames = Array(
      Tuple2(1, "Spark"),
      Tuple2(2, "Tachyon"),
      Tuple2(3, "Hadoop")
    )
    val studentScores = Array(
      Tuple2(1, 100),
      Tuple2(2, 95),
      Tuple2(3, 90)
    )
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)

    val studentInformation = names.join(scores)
    studentInformation.collect().foreach(println)
    //expected output (order may vary):
    //(1,(Spark,100))
    //(3,(Hadoop,90))
    //(2,(Tachyon,95))
  }
}
//cogroup implemented in Java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class CogroupOps {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Cogroup Transformation").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<Integer, String>> namesList = Arrays.asList(
                new Tuple2<Integer, String>(1, "Spark"),
                new Tuple2<Integer, String>(2, "Tachyon"),
                new Tuple2<Integer, String>(3, "Hadoop")
                );
        List<Tuple2<Integer, Integer>> scoresList = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 100),
                new Tuple2<Integer, Integer>(2, 90),
                new Tuple2<Integer, Integer>(3, 70),
                new Tuple2<Integer, Integer>(1, 110),
                new Tuple2<Integer, Integer>(2, 95),
                new Tuple2<Integer, Integer>(2, 60)
                );
        JavaPairRDD<Integer, String> names = sc.parallelizePairs(namesList);
        JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoresList);

        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> nameScores = names.cogroup(scores); //cogroup groups the values from both RDDs that share the same key into a pair of Iterables
        nameScores.foreach(new VoidFunction<Tuple2<Integer,Tuple2<Iterable<String>,Iterable<Integer>>>>() {

            @Override
            public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> t) throws Exception {
                System.out.println("Student ID:" + t._1);
                System.out.println("Name:" + t._2._1);
                System.out.println("Score:" + t._2._2);

                System.out.println("===========");
            }
        });
        sc.close();
    }
}

/*
Student ID:1
Name:[Spark]
Score:[100, 110]
===========
Student ID:3
Name:[Hadoop]
Score:[70]
===========
Student ID:2
Name:[Tachyon]
Score:[90, 95, 60]
===========
*/
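
For completeness, the same cogroup operation can also be expressed in Scala. The following is a minimal sketch (assuming the same local master and the same sample data as the Java version; the object name CogroupScala is only for illustration):

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CogroupScala {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Cogroup Transformation").setMaster("local"))

    val names = sc.parallelize(Array((1, "Spark"), (2, "Tachyon"), (3, "Hadoop")))
    val scores = sc.parallelize(Array((1, 100), (2, 90), (3, 70), (1, 110), (2, 95), (2, 60)))

    //cogroup groups the values from both RDDs that share the same key into a pair of Iterables
    names.cogroup(scores).collect().foreach { case (id, (ns, ss)) =>
      println("Student ID:" + id)
      println("Name:" + ns.mkString("[", ", ", "]"))
      println("Score:" + ss.mkString("[", ", ", "]"))
      println("===========")
    }

    sc.stop()
  }
}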