Transformation Operators in Spark

package com.uplooking.bigdata.core.p2;
 
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
 
/**
 * Java examples of Spark transformation operators:
 *   1. map: multiply each element of a collection by 7
 *   2. filter: select the odd numbers in a collection
 *   3. flatMap: split lines into words
 *   4. sample: draw a random sample of fraction frac, using a given random seed
 *   5. union: return a new dataset combining the source dataset and the argument
 *   6. groupByKey: group a dataset by key
 *   7. reduceByKey: count the students in each class
 *   8. join: print the joined combinations
 *   9. sortByKey: sort students by height
 *   10. cogroup: print each student's height (not implemented below; a sketch follows the join method)
 */
public class JavaSparkTransformationOps {
 
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName(JavaSparkTransformationOps.class.getSimpleName())
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
//        transformation_1_map_Ops(sc);
//        transformation_2_filter_Ops(sc);
//        transformation_3_flatMap_Ops(sc);
//        transformation_4_sample_Ops(sc);
//        transformation_5_union_Ops(sc);
//        transformation_6_gbk_Ops(sc);
//        transformation_7_rbk_Ops(sc);
//        transformation_8_join_Ops(sc);
        transformation_9_sbk_Ops(sc);
        //close the SparkContext
        sc.close();
    }
    /**
        9. sortByKey: sort students by height
        Sorts by key, much like a TreeMap.
        The further goal stated here (sort by name ascending and, for equal names,
        by height descending) is a secondary sort, which plain sortByKey cannot
        express; see the sketch after this method.
    */
    public static void transformation_9_sbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
          "zhangsan 176",
          "xiaodingding 175",
          "xiaobao 173",
          "heyajie 174.5",
          "liujun 173",
          "wangxiaoxiong 150"
        );
        JavaRDD<String> listRDD = sc.parallelize(list);
        JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(new PairFunction<String, String, Double>() {
 
            @Override
            public Tuple2<String, Double> call(String line) throws Exception {
                String[] splits = line.split(" ");
                String name = splits[0].trim();
                double height = Double.valueOf(splits[1].trim());
                return new Tuple2<String, Double>(name, height);
            }
        });
//        pairRDD.foreach(t -> System.out.println(t));
//        pairRDD.sortByKey(false).foreach(t -> System.out.println(t));
        // sort by height in descending order: swap the pair so height becomes the key
        JavaPairRDD<Double, String> reversePairRDD = pairRDD.mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {
            @Override
            public Tuple2<Double, String> call(Tuple2<String, Double> t) throws Exception {
                return new Tuple2<Double, String>(t._2(), t._1());
            }
        });
        reversePairRDD.sortByKey(false).foreach(t -> System.out.println(t._2 + " " + t._1));
    }
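    /**
     * Hedged sketch (not part of the original post): the comment above asks for a
     * secondary sort (name ascending, height descending on ties), which sortByKey on a
     * single scalar key cannot express. One common approach is a composite key that
     * implements Comparable; the names NameHeightKey and
     * transformation_9_secondary_sort_sketch are hypothetical.
     */
    public static class NameHeightKey implements Comparable<NameHeightKey>, java.io.Serializable {
        final String name;
        final double height;
        public NameHeightKey(String name, double height) {
            this.name = name;
            this.height = height;
        }
        @Override
        public int compareTo(NameHeightKey other) {
            int byName = this.name.compareTo(other.name); // name ascending first
            if (byName != 0) {
                return byName;
            }
            return Double.compare(other.height, this.height); // then height descending
        }
        @Override
        public String toString() {
            return name + " " + height;
        }
    }

    public static void transformation_9_secondary_sort_sketch(JavaSparkContext sc) {
        List<String> list = Arrays.asList("zhangsan 176", "zhangsan 173", "liujun 173");
        JavaPairRDD<NameHeightKey, String> keyedRDD = sc.parallelize(list)
                .mapToPair(line -> {
                    String[] splits = line.split(" ");
                    NameHeightKey key = new NameHeightKey(splits[0].trim(), Double.valueOf(splits[1].trim()));
                    return new Tuple2<NameHeightKey, String>(key, line);
                });
        // sortByKey(true) now orders by NameHeightKey.compareTo, giving the secondary sort
        keyedRDD.sortByKey(true).foreach(t -> System.out.println(t._1()));
    }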
    /**
     * 8. join: print the joined combinations
     * join(otherDataset, [numTasks]): called on datasets of types (K, V) and (K, W),
     * returns a dataset of (K, (V, W)) pairs that pairs up the elements sharing
     * each key
     */
    public static void transformation_8_join_Ops(JavaSparkContext sc) {
        List<String> maleList = Arrays.asList(
                "bd_1 male 20",
                "bd_2 male 25",
                "bd_3 male 15");
        List<String> femaleList = Arrays.asList(
                "bd_1 female 2",
                "bd_2 female 10",
                "bd_3 female 5"
        );
        JavaRDD<String> maleListRDD = sc.parallelize(maleList);
        JavaRDD<String> femaleListRDD = sc.parallelize(femaleList);
        JavaPairRDD<String, Integer> malePairRDD = maleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
        JavaPairRDD<String, Integer> femalePairRDD = femaleListRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            String className = splits[0].trim();
            int personNum = Integer.valueOf(splits[2].trim());
            return new Tuple2<String, Integer>(className, personNum);
        });
 
        JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
        System.out.println("-------malePairRDD----------");
        malePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------femalePairRDD----------");
        femalePairRDD.foreach(t -> System.out.println(t));
        System.out.println("-------joinRDD----------");
        joinRDD.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
            @Override
            public void call(Tuple2<String, Tuple2<Integer, Integer>> t) throws Exception {
                System.out.println("t._1: " + t._1() + ", t._2._1: " + t._2._1() + ", t._2._2: " + t._2._2);
            }
        });
    }
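    /**
     * Hedged sketch (not part of the original post): the class comment lists a 10th
     * operator, cogroup, but no implementation appears in the post. A minimal example
     * under that assumption; the method name and sample data are hypothetical.
     * cogroup groups the values of both RDDs per key into
     * (K, (Iterable<V>, Iterable<W>)), keeping keys that appear on only one side.
     */
    public static void transformation_10_cogroup_sketch(JavaSparkContext sc) {
        JavaPairRDD<String, Double> heightRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Double>("zhangsan", 176.0),
                new Tuple2<String, Double>("liujun", 173.0)));
        JavaPairRDD<String, Integer> scoreRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("zhangsan", 90),
                new Tuple2<String, Integer>("xiaobao", 85)));
        JavaPairRDD<String, Tuple2<Iterable<Double>, Iterable<Integer>>> cogroupRDD =
                heightRDD.cogroup(scoreRDD);
        // unlike join, "liujun" and "xiaobao" still appear, with one empty Iterable
        cogroupRDD.foreach(t -> System.out.println(t._1() + " -> " + t._2()));
    }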
    /**
     * 7. reduceByKey: count the students in each class
     */
    public static void transformation_7_rbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList(
                    //className gender num
                    "bd_1 male 20",
                    "bd_1 female 2",
                    "bd_2 male 25",
                    "bd_2 female 10",
                    "bd_3 male 15");
        JavaRDD<String> listRDD = sc.parallelize(list);
        JavaPairRDD<String, Integer> pairRDD = listRDD.mapToPair(new PairFunction<String, String, Integer>() {
            /**
             * Returns tuples of <className, studentCount>
             * @param line
             * @return
             * @throws Exception
             */
            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] splits = line.split(" ");
                if (splits.length < 3) {
                    // String.split never returns null; reject malformed lines instead of
                    // returning null, which would put a null element in the RDD and fail downstream
                    throw new IllegalArgumentException("malformed line: " + line);
                }
                String className = splits[0].trim();
                String gender = splits[1].trim();
                int num = Integer.valueOf(splits[2].trim());
                return new Tuple2<String, Integer>(className, num);
            }
        });
        // sum the per-gender counts for each class
        JavaPairRDD<String, Integer> retRDD = pairRDD.reduceByKey((t1, t2) -> {return t1 + t2;});
        retRDD.foreach(t -> System.out.println("className: " + t._1() + ", personNum:" + t._2));
    }
    /**
     *  6. groupByKey: group a dataset by key
     *  groupByKey([numTasks]): called on a dataset of (K, V) pairs, returns a dataset
     *  of (K, Seq[V]) pairs. Note: by default 8 parallel tasks are used for the
     *  grouping; you can pass the optional numTasks argument to match the number of
     *  tasks to the data volume.
     *
     *  Note:
     *      Avoid groupByKey unless you have no other choice. Like reduceByKey, it
     *    aggregates the values of each key and therefore pulls data from every
     *    partition. The difference is that groupByKey does no map-side (local)
     *    pre-aggregation while reduceByKey does, so groupByKey moves far more data
     *    in the shuffle and hurts performance. A reduceByKey version of the same
     *    word count follows this method.
     */
    public static void transformation_6_gbk_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        JavaRDD<String> wordsRDD = listRDD.flatMap(line -> {
            return Arrays.asList(line.split(" "));
        });
        JavaPairRDD<String, Integer> pairRDD = wordsRDD.mapToPair(word -> {
            return new Tuple2<String, Integer>(word, 1);
        });
        pairRDD.foreach(t -> System.out.println(t._1 + " " + t._2()));
        JavaPairRDD<String, Iterable<Integer>> gbkRDD = pairRDD.groupByKey();
        gbkRDD.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println(t._1 + " " + t._2);
            }
        });
    }
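    /**
     * Hedged sketch (not part of the original post): the note above contrasts
     * groupByKey with reduceByKey. For this word count, reduceByKey produces the same
     * totals while combining values on each partition before the shuffle, which is why
     * it is usually preferred; the method name is hypothetical.
     */
    public static void transformation_6_rbk_alternative_sketch(JavaSparkContext sc) {
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("hello you", "hello me", "hello me"));
        JavaPairRDD<String, Integer> counts = lines
                .flatMap(line -> Arrays.asList(line.split(" ")))
                .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
                .reduceByKey((a, b) -> a + b); // local pre-aggregation, then shuffle
        counts.foreach(t -> System.out.println(t._1() + " " + t._2()));
    }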
    /**
     * 5. union(otherDataset): return a new dataset combining the source dataset
     * and the argument
     * @param sc
     */
    public static void transformation_5_union_Ops(JavaSparkContext sc) {
        JavaRDD<Integer> oddRDD = sc.parallelize(Arrays.asList(1, 3, 5, 7, 9));
        JavaRDD<Integer> evenRDD = sc.parallelize(Arrays.asList(2, 4, 6, 8, 10));
 
        JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
        unionRDD.foreach(t -> System.out.println(t));
 
    }
 
 
    /**
     * 4. sample: draw a random sample of fraction frac, using a given random seed
     * Samples an RDD, drawing a fraction frac of the whole dataset and using the
     * sample to estimate the distribution of the full data.
     * Often used in Spark to diagnose data skew by inspecting a sample of the data.
     * Note: the call below uses the overload without a seed; a seeded variant is
     * sketched after this method.
     */
    public static void transformation_4_sample_Ops(JavaSparkContext sc) {
        List<Integer> list = new ArrayList<Integer>(10001);
        for(int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // the sample size is not exactly fraction * count; it varies around that value
        JavaRDD<Integer> sampleRDD = listRDD.sample(true, 0.01);
        System.out.println("sampleRDD的数据规模:" + sampleRDD.count());
        sampleRDD.foreach(i -> System.out.println(i));
    }
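    /**
     * Hedged sketch (not part of the original post): the comment above mentions a random
     * seed, but the call uses the two-argument overload. JavaRDD also offers
     * sample(withReplacement, fraction, seed); fixing the seed makes the sample
     * reproducible across runs. The method name is hypothetical.
     */
    public static void transformation_4_sample_with_seed_sketch(JavaSparkContext sc) {
        List<Integer> list = new ArrayList<Integer>(10000);
        for (int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        // same seed -> same sample on every run
        JavaRDD<Integer> sampleRDD = listRDD.sample(false, 0.01, 42L);
        System.out.println("seeded sample size: " + sampleRDD.count());
    }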
 
    /**
     * 3. flatMap: split lines into words
     * For example, if the string "hello you me" enters flatMap, the operator can break
     * it apart according to the given rule, turning it into a Seq[String] --->
     * hello
     * you
     * me
     * @param sc
     */
    public static void transformation_3_flatMap_Ops(JavaSparkContext sc) {
        List<String> list = Arrays.asList("hello you", "hello me");
        JavaRDD<String> listRDD = sc.parallelize(list);
        JavaRDD<String> wordsRDD = listRDD.flatMap(new FlatMapFunction<String, String>() {
 
            @Override
            public Iterable<String> call(String line) throws Exception {
//                System.out.println("line==>" + line);
                return Arrays.asList(line.split(" "));
            }
        });
        wordsRDD.foreach(word -> System.out.println(word));
    }
    /*
         2. filter(func): select the odd numbers in a collection
         filter drops the data that does not satisfy func; func returns a Boolean,
         filter removes the elements for which func returns false and keeps those
         for which it returns true,
         so the result of filter is a subset of the source RDD
     */
    public static void transformation_2_filter_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        JavaRDD<Integer> filteredRDD = listRDD.filter(new Function<Integer, Boolean>() {
            @Override
            public Boolean call(Integer v1) throws Exception {
                return v1 % 2 != 0; // keep the odd numbers, as described above
            }
        });
 
        filteredRDD.foreach(v -> System.out.println(v));
 
 
    }
    /*
        map(func): return a new distributed dataset formed by passing each element
        of the source through the function func.
            It does not change the element count: the new RDD has exactly as many
        elements as the source RDD.
        1. map: multiply each element of a collection by 7
     */
    public static void transformation_1_map_Ops(JavaSparkContext sc) {
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> listRDD = sc.parallelize(list);
        JavaRDD<Integer> retRDD = listRDD.map(num -> {
            return num * 7;
        });
 
        retRDD.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer i) throws Exception {
                System.out.println(i);
            }
        });
    }
}