Spark Advanced Operators

This article demonstrates Spark's advanced operators through Java code examples.

1 aggregateByKey

// aggregateByKey takes three parameters
// reduceByKey can be thought of as a simplified version of aggregateByKey
// (a reduceByKey version of this word count is sketched right after the example below)
// The key point of aggregateByKey is that it exposes an extra function, the seq function,
// which lets you control how the data inside each partition is aggregated first,
// similar to the map-side combine in MapReduce,
// before the results from all partitions are aggregated globally
// The first parameter is the initial (zero) value for each key
// The second is a function, the seq function, which performs the local aggregation on the shuffle map side
// The third is a function, the combiner function, which performs the global aggregation on the shuffle reduce side

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class AggregateByKey {
   
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("AggregateByKey")
            .setMaster("local");       
      JavaSparkContext sc = new JavaSparkContext(conf);
      
      JavaRDD<String> lines = sc.textFile(
            "C://Users//Administrator//Desktop//hello.txt",
            3); 
      
      JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
         
         private static final long serialVersionUID = 1L;
         
         @Override
         public Iterable<String> call(String line) throws Exception {
            return Arrays.asList(line.split(" "));  
         }
         
      });
      
      JavaPairRDD<String, Integer> pairs = words.mapToPair(
            
            new PairFunction<String, String, Integer>() {

               private static final long serialVersionUID = 1L;
               
               @Override
               public Tuple2<String, Integer> call(String word) throws Exception {
                  return new Tuple2<String, Integer>(word, 1);
               }
               
            });
      

      
      JavaPairRDD<String, Integer> wordCounts = pairs.aggregateByKey(
            0, 
            
            new Function2<Integer, Integer, Integer>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Integer call(Integer v1, Integer v2)
                     throws Exception {
                  return v1 + v2;
               }
               
            }, 
            
            new Function2<Integer, Integer, Integer>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Integer call(Integer v1, Integer v2)
                     throws Exception {
                  return v1 + v2;
               }
               
            });
      
      List<Tuple2<String, Integer>> wordCountList = wordCounts.collect();
      for(Tuple2<String, Integer> wordCount : wordCountList) {
         System.out.println(wordCount);  
      }
      
      sc.close();
   }
   
}
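
For comparison, here is a minimal sketch of the same word count written with reduceByKey, reusing the pairs RDD from the example above; it only needs the combine function, which is why reduceByKey can be seen as the simplified form of aggregateByKey:

      // reduceByKey exposes only the combine function; there is no explicit initial value
      // and no separate map-side function: the same function is used on both sides of the shuffle
      JavaPairRDD<String, Integer> wordCounts2 = pairs.reduceByKey(
            new Function2<Integer, Integer, Integer>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Integer call(Integer v1, Integer v2) throws Exception {
                  return v1 + v2;
               }

            });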

2 cartesian

// cartesian computes the Cartesian product
// For example, given two RDDs with 10 records each, after applying the cartesian operator
// every record of one RDD is paired with every record of the other RDD,
// which yields the Cartesian product of the two (10 x 10 = 100 pairs)

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class Cartesian {
   
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Cartesian")
            .setMaster("local");  
      JavaSparkContext sc = new JavaSparkContext(conf);
      
   
      
      // Small example
      // Say we have a few tops and a few pairs of trousers, held in two separate RDDs
      // We want to pair every top with every pair of trousers to try out outfit combinations
      
      List<String> clothes = Arrays.asList("夹克", "T恤", "皮衣", "风衣");  
      JavaRDD<String> clothesRDD = sc.parallelize(clothes);
      
      List<String> trousers = Arrays.asList("皮裤", "运动裤", "牛仔裤", "休闲裤");  
      JavaRDD<String> trousersRDD = sc.parallelize(trousers);
      
      JavaPairRDD<String, String> pairsRDD = clothesRDD.cartesian(trousersRDD);

      for(Tuple2<String, String> pair : pairsRDD.collect()) {
         System.out.println(pair);  
      }
      
      sc.close();
   }
   
}

3 coalesce

// The coalesce operator reduces the number of partitions of an RDD,
// packing the data into fewer partitions
// The recommended use case is in combination with the filter operator:
// after filter has dropped a large share of the data, say 30% of it,
// the remaining data is often spread unevenly across the partitions
// In that situation coalesce can shrink the number of partitions
// so that the data in each partition is packed more tightly
// As an analogy: a company originally has 6 departments,
// but after a round of layoffs some departments are left with hardly anyone,
// so the staffing across departments is uneven
// The fix is to consolidate departments, merging the remaining staff into fewer of them

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Coalesce {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Coalesce")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);
   

      
      List<String> staffList = Arrays.asList("张三", "李四", "王二", "麻子",
            "赵六", "王五", "李大个", "王大妞", "小明", "小倩");  
      JavaRDD<String> staffRDD = sc.parallelize(staffList, 6); 
      
      JavaRDD<String> staffRDD2 = staffRDD.mapPartitionsWithIndex(
            
            new Function2<Integer, Iterator<String>, Iterator<String>>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Iterator<String> call(Integer index, Iterator<String> iterator)
                     throws Exception {
                  List<String> list = new ArrayList<String>();
                  
                  while(iterator.hasNext()) {
                     String staff = iterator.next();
                     list.add("部门[" + (index + 1) + "], " + staff);
                  }
                  
                  return list.iterator();
               }
               
            }, true);
      
      for(String staffInfo : staffRDD2.collect()) {
         System.out.println(staffInfo);  
      }
      
      JavaRDD<String> staffRDD3 = staffRDD2.coalesce(3);
      
      JavaRDD<String> staffRDD4 = staffRDD3.mapPartitionsWithIndex(
            
            new Function2<Integer, Iterator<String>, Iterator<String>>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Iterator<String> call(Integer index, Iterator<String> iterator)
                     throws Exception {
                  List<String> list = new ArrayList<String>();
                  
                  while(iterator.hasNext()) {
                     String staff = iterator.next();
                     list.add("部门[" + (index + 1) + "], " + staff);
                  }
                  
                  return list.iterator();
               }
               
            }, true);
      
      for(String staffInfo : staffRDD4.collect()) {
         System.out.println(staffInfo);  
      }
      
      sc.close();
   }
   
}
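
Coalesce also has a two-argument overload, coalesce(numPartitions, shuffle). Without a shuffle (the default, as used above) it can only reduce the partition count by merging existing partitions; passing true forces a full shuffle, which redistributes the data evenly at the cost of moving it across the network. A minimal sketch, reusing staffRDD2 from the example above:

      // force a shuffle so the remaining staff are spread evenly across 3 partitions
      JavaRDD<String> evenlyCoalescedRDD = staffRDD2.coalesce(3, true);
      System.out.println("partitions after coalesce(3, true): " + evenlyCoalescedRDD.partitions().size());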

4 distinct

// The distinct operator
// removes duplicate records from an RDD
// UV statistics example
// UV: unique visitors; a user may click through a site many times in one day,
// so to compute UV we deduplicate the users and count how many distinct users visited the site that day,
// rather than how many times the site was visited in total (that would be PV, page views)

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class Distinct {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Distinct")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);
      

      
      List<String> accessLogs = Arrays.asList(
            "user1 2016-01-01 23:58:42", 
            "user1 2016-01-01 23:58:43", 
            "user1 2016-01-01 23:58:44", 
            "user2 2016-01-01 12:58:42",
            "user2 2016-01-01 12:58:46", 
            "user3 2016-01-01 12:58:42", 
            "user4 2016-01-01 12:58:42", 
            "user5 2016-01-01 12:58:42", 
            "user6 2016-01-01 12:58:42", 
            "user6 2016-01-01 12:58:45");  
      JavaRDD<String> accessLogsRDD = sc.parallelize(accessLogs);
      
      JavaRDD<String> useridsRDD = accessLogsRDD.map(new Function<String, String>() {

         private static final long serialVersionUID = 1L;

         @Override
         public String call(String accessLog) throws Exception {
            String userid = accessLog.split(" ")[0];
            return userid;
         }
         
      });
      
      JavaRDD<String> distinctUseridsRDD = useridsRDD.distinct();
      int uv = distinctUseridsRDD.collect().size();
      System.out.println("uv: " + uv);  
      
      sc.close();
   }
   
}
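
A small note on the last step: collect().size() pulls every distinct user id back to the driver just to count them. The count() action computes the size on the executors and returns only a number, which scales better when the deduplicated data is still large. A minimal alternative for the counting lines above:

      // count() returns a long and avoids materializing all distinct user ids on the driver
      long uv = distinctUseridsRDD.count();
      System.out.println("uv: " + uv);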

5 intersection

// The intersection operator
// returns the records that appear in both RDDs
// In a company, some people may work on several projects at once and belong to more than one project team,
// so given two RDDs representing the members of two project teams, we take their intersection

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class Intersection {
   
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Intersection")
            .setMaster("local");  
      JavaSparkContext sc = new JavaSparkContext(conf);
      List<String> project1MemberList = Arrays.asList("张三", "李四", "王二", "麻子");  
      JavaRDD<String> project1MemberRDD = sc.parallelize(project1MemberList);
      List<String> project2MemberList = Arrays.asList("张三", "王五", "小明", "小倩");  
      JavaRDD<String> project2MemberRDD = sc.parallelize(project2MemberList);
      
      JavaRDD<String> projectIntersectionRDD = project1MemberRDD.intersection(project2MemberRDD);

      for(String member : projectIntersectionRDD.collect()) {
         System.out.println(member);  
      }
      
      sc.close();
   }
   
}

6 mapPartitions

// mapPartitions
// is similar to map; the difference is that map processes one record of a partition at a time,
// while mapPartitions processes all the records of a partition in a single call
// Recommended usage:
// if the amount of data per partition is not too large, mapPartitions can be faster than map
// but if the RDD is huge, say a billion records, mapPartitions is not recommended,
// because buffering a whole partition in memory may cause an OutOfMemoryError

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class MapPartitions {
   
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("MapPartitions")
            .setMaster("local");  
      JavaSparkContext sc = new JavaSparkContext(conf);
   
      // prepare some mock data
      List<String> studentNames = Arrays.asList("张三", "李四", "王二", "麻子");  
      JavaRDD<String> studentNamesRDD = sc.parallelize(studentNames, 2);
      
      final Map<String, Double> studentScoreMap = new HashMap<String, Double>();
      studentScoreMap.put("张三", 278.5);  
      studentScoreMap.put("李四", 290.0);  
      studentScoreMap.put("王二", 301.0);  
      studentScoreMap.put("麻子", 205.0);  
      
      
      JavaRDD<Double> studentScoresRDD = studentNamesRDD.mapPartitions(
            
            new FlatMapFunction<Iterator<String>, Double>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Iterable<Double> call(Iterator<String> iterator)
                     throws Exception {
                  // mapPartitions processes all the data of one partition in a single call
                  // the call function receives an Iterator over all the records in the partition
                  // and returns an Iterable, i.e. possibly multiple records; a List is commonly used
                  
                  List<Double> studentScoreList = new ArrayList<Double>();
                  
                  while(iterator.hasNext()) {
                     String studentName = iterator.next();
                     Double studentScore = studentScoreMap.get(studentName);
                     studentScoreList.add(studentScore);
                  }
                  
                  return studentScoreList;
               }
               
            });
      
      for(Double studentScore: studentScoresRDD.collect()) {
         System.out.println(studentScore);  
      }
      
      sc.close();
   }
   
}
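
Beyond raw speed, a classic reason to prefer mapPartitions over map is to amortize expensive per-partition setup, for example opening a database or HTTP connection once per partition instead of once per record. A minimal sketch of that pattern, reusing studentNamesRDD from the example above; ExpensiveClient and its lookup/close methods are hypothetical placeholders for whatever resource you actually open:

      JavaRDD<String> lookedUpRDD = studentNamesRDD.mapPartitions(
            
            new FlatMapFunction<Iterator<String>, String>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Iterable<String> call(Iterator<String> iterator) throws Exception {
                  // hypothetical client, created once per partition rather than once per record
                  ExpensiveClient client = new ExpensiveClient();
                  List<String> results = new ArrayList<String>();
                  
                  while(iterator.hasNext()) {
                     results.add(client.lookup(iterator.next()));
                  }
                  
                  client.close();
                  return results;
               }
               
            });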

7 repartition

// The repartition operator can change the number of partitions of an RDD, up or down
// The difference from coalesce is that coalesce (without a shuffle) can only reduce the partition count,
// while repartition can also increase it

// Recommended use case:
// a classic scenario is querying Hive data with Spark SQL
// Spark SQL decides how many partitions the resulting RDD has based on the number of HDFS blocks
// of the underlying Hive files, and that number cannot be configured directly

// Sometimes the automatically chosen partition count is too low, which makes the downstream operators very slow
// In that case, right after Spark SQL has loaded the Hive data into an RDD,
// use repartition to increase the number of partitions

// Example
// The company wants to add new departments
// The headcount stays the same, so we use repartition to create more departments

// and spread the staff evenly across them

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Repartition {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Repartition")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);
   

      
      List<String> staffList = Arrays.asList("张三", "李四", "王二", "麻子",
            "赵六", "王五", "李大个", "王大妞", "小明", "小倩");  
      JavaRDD<String> staffRDD = sc.parallelize(staffList, 3); 
      
      JavaRDD<String> staffRDD2 = staffRDD.mapPartitionsWithIndex(
            
            new Function2<Integer, Iterator<String>, Iterator<String>>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Iterator<String> call(Integer index, Iterator<String> iterator)
                     throws Exception {
                  List<String> list = new ArrayList<String>();
                  
                  while(iterator.hasNext()) {
                     String staff = iterator.next();
                     list.add("部门[" + (index + 1) + "], " + staff);
                  }
                  
                  return list.iterator();
               }
               
            }, true);
      
      for(String staffInfo : staffRDD2.collect()) {
         System.out.println(staffInfo);  
      }
      
      JavaRDD<String> staffRDD3 = staffRDD2.repartition(6);
      
      JavaRDD<String> staffRDD4 = staffRDD3.mapPartitionsWithIndex(
            
            new Function2<Integer, Iterator<String>, Iterator<String>>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Iterator<String> call(Integer index, Iterator<String> iterator)
                     throws Exception {
                  List<String> list = new ArrayList<String>();
                  
                  while(iterator.hasNext()) {
                     String staff = iterator.next();
                     list.add("部门[" + (index + 1) + "], " + staff);
                  }
                  
                  return list.iterator();
               }
               
            }, true);
      
      for(String staffInfo : staffRDD4.collect()) {
         System.out.println(staffInfo);  
      }
      
      sc.close();
   }
   
}
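
In the RDD API, repartition(n) is essentially coalesce(n, true): it is coalesce with the shuffle flag turned on, which is what allows it to increase the partition count. So the practical choice between the two comes down to whether you are willing to pay for a shuffle. The two calls below should behave the same way on staffRDD2 from the example above:

      // equivalent ways to grow the RDD to 6 partitions
      JavaRDD<String> moreDepartments1 = staffRDD2.repartition(6);
      JavaRDD<String> moreDepartments2 = staffRDD2.coalesce(6, true);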

8 sample

// The sample operator
// randomly samples data from an RDD with a given fraction, for example 0.1 or 0.9,
// drawing roughly 10% or 90% of the records
// The recommendation here is not to set the third parameter, the random seed


import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class Sample {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Sample")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);
   
      List<String> staffList = Arrays.asList("张三", "李四", "王二", "麻子",
            "赵六", "王五", "李大个", "王大妞", "小明", "小倩");  
      JavaRDD<String> staffRDD = sc.parallelize(staffList);
      
      
      
      JavaRDD<String> luckyStaffRDD = staffRDD.sample(false, 0.1);
      
      for(String staff : luckyStaffRDD.collect()) {
         System.out.println(staff);  
      }
      
      sc.close();
   }
   
}
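
9 takeSample

// The takeSample operator is similar to sample, but it is an action rather than a transformation:
// instead of returning a new RDD holding a random fraction of the data,
// it returns a fixed number of randomly chosen elements directly to the driver as a List
// An optional third parameter is a random seed, which makes the draw reproducible

A minimal sketch of takeSample on the same staff list as in the sample example above (the class name TakeSample, the sample size of 3, and the seed value 42 are illustrative choices, not part of the original example):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TakeSample {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("TakeSample")
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);

      List<String> staffList = Arrays.asList("张三", "李四", "王二", "麻子",
            "赵六", "王五", "李大个", "王大妞", "小明", "小倩");
      JavaRDD<String> staffRDD = sc.parallelize(staffList);

      // takeSample is an action: it returns 3 randomly drawn staff members as a List,
      // so no collect() is needed; the seed (42) makes the same 3 people come up on every run
      List<String> luckyStaffList = staffRDD.takeSample(false, 3, 42);

      for(String staff : luckyStaffList) {
         System.out.println(staff);
      }

      sc.close();
   }

}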

10 union

// The union operator
// merges the data of two RDDs into a single RDD

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class Union {
   
   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("Union") 
            .setMaster("local");  
      JavaSparkContext sc = new JavaSparkContext(conf);
   
      
      
      List<String> department1StaffList = Arrays.asList("张三", "李四", "王二", "麻子");  
      JavaRDD<String> department1StaffRDD = sc.parallelize(department1StaffList);
      
      List<String> department2StaffList = Arrays.asList("赵六", "王五", "小明", "小倩");  
      JavaRDD<String> department2StaffRDD = sc.parallelize(department2StaffList);
      
      JavaRDD<String> departmentStaffRDD = department1StaffRDD.union(department2StaffRDD);
      
      for(String staff : departmentStaffRDD.collect()) {
         System.out.println(staff);  
      }
      
      sc.close();
   }
   
}
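
One detail worth noting: unlike UNION in SQL, the union operator keeps duplicates, so if the two RDDs share elements and each should appear only once in the result, chain distinct() after union. A minimal one-liner reusing the RDDs from the example above:

      // union keeps duplicates; distinct() removes them after the merge
      JavaRDD<String> dedupedStaffRDD = department1StaffRDD.union(department2StaffRDD).distinct();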
