Java实现Spark groupByKey等算子

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

import java.util.*;

/**
 * Small Spark job that demonstrates {@code groupByKey} on a comma-separated report file.
 *
 * <p>Each input line is first normalised (the date part of column 9 is collapsed, see
 * {@link #normalizeLine(String)}), then three per-group sums are computed over the same
 * grouping key (columns 1-4 and 6 joined by tabs):
 * <ul>
 *   <li>a record count per group,</li>
 *   <li>the sum of column 11 per group,</li>
 *   <li>the sum of column 10 per group.</li>
 * </ul>
 * Only the column-10 sums are printed, followed by every normalised input line.
 *
 * <p>NOTE(review): the meaning of the columns (people / times / learn times) is inferred
 * from the original variable names only; confirm against the CSV schema.
 */
public class baobiao {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "/Users/yangyang/Desktop/textbook_exercise_use_report.csv";

    /** Index of the column holding the timestamp that gets collapsed during normalisation. */
    private static final int TIMESTAMP_COLUMN = 9;

    /** Index of the integer column summed into {@code all_learn_times}. */
    private static final int LEARN_TIME_COLUMN = 10;

    /** Index of the integer column summed into {@code all_times}. */
    private static final int TIMES_COLUMN = 11;

    /**
     * Runs the job on a local 4-thread Spark master.
     *
     * @param args optional; {@code args[0]} overrides the input path, otherwise
     *             {@link #DEFAULT_INPUT_PATH} is read
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        SparkSession spark = SparkSession.builder().appName("dup").master("local[4]").getOrCreate();
        try {
            // Cached because the same normalised lines feed three aggregations and a final print.
            JavaRDD<String> input = spark.sparkContext().textFile(inputPath, 1)
                    .toJavaRDD()
                    .map(baobiao::normalizeLine)
                    .persist(StorageLevel.MEMORY_ONLY());

            JavaPairRDD<String, Integer> all_people = sumPerGroup(input, fields -> 1);
            JavaPairRDD<String, Integer> all_times =
                    sumPerGroup(input, fields -> Integer.valueOf(fields[TIMES_COLUMN]));
            JavaPairRDD<String, Integer> all_learn_times =
                    sumPerGroup(input, fields -> Integer.valueOf(fields[LEARN_TIME_COLUMN]));

            // All three results are materialised as in the original program, although only
            // the last one is printed; the other two are available for inspection/debugging.
            List<Tuple2<String, Integer>> all_people_list = all_people.collect();
            List<Tuple2<String, Integer>> all_times_list = all_times.collect();
            List<Tuple2<String, Integer>> all_learn_times_list = all_learn_times.collect();

            for (Tuple2<String, Integer> entry : all_learn_times_list) {
                System.out.println(entry._1() + "\t" + entry._2());
            }
            input.foreach(x -> System.out.println(x));
        } finally {
            // Always release the session, even when a job fails (e.g. on a non-numeric column).
            spark.close();
        }
    }

    /**
     * Rewrites one raw line so that column 9 keeps only the leading date components.
     *
     * <p>The column is cut at the first space, split on {@code '-'}, and every piece except
     * the last is concatenated (so {@code "2019-05-12 10:00:00"} becomes {@code "201905"}).
     * The fields are then re-joined with commas; a trailing comma is kept on purpose so the
     * emitted text matches the original program byte for byte.
     *
     * @param line one comma-separated record with at least 10 fields
     * @return the normalised record, terminated by a comma
     * @throws ArrayIndexOutOfBoundsException if the line has fewer than 10 fields
     */
    private static String normalizeLine(String line) {
        String[] fields = line.split(",");
        String[] dateParts = fields[TIMESTAMP_COLUMN].split(" ")[0].split("-");

        StringBuilder collapsed = new StringBuilder();
        for (int i = 0; i < dateParts.length - 1; i++) {
            collapsed.append(dateParts[i]);
        }
        fields[TIMESTAMP_COLUMN] = collapsed.toString();

        StringBuilder out = new StringBuilder();
        for (String field : fields) {
            out.append(field).append(',');
        }
        return out.toString();
    }

    /**
     * Builds the grouping key: columns 1, 2, 3, 4 and 6 joined by tab characters.
     *
     * @param fields the split record
     * @return the tab-separated key
     */
    private static String groupKey(String[] fields) {
        return String.join("\t", fields[1], fields[2], fields[3], fields[4], fields[6]);
    }

    /**
     * Groups records by {@link #groupKey(String[])} and sums one integer value per record.
     *
     * <p>{@code groupByKey} is kept because it is the operator this example demonstrates;
     * for a plain sum, {@code reduceByKey(Integer::sum)} gives the same result while
     * combining values before the shuffle.
     *
     * @param records        normalised, comma-separated lines
     * @param valueExtractor yields the integer contributed by a record, given its split fields
     * @return one {@code (key, sum)} pair per distinct key
     */
    private static JavaPairRDD<String, Integer> sumPerGroup(
            JavaRDD<String> records, Function<String[], Integer> valueExtractor) {
        return records
                .mapToPair(line -> {
                    String[] fields = line.split(",");
                    return new Tuple2<>(groupKey(fields), valueExtractor.call(fields));
                })
                .groupByKey()
                .mapToPair(group -> {
                    int sum = 0;
                    for (Integer value : group._2()) {
                        sum += value;
                    }
                    return new Tuple2<>(group._1(), sum);
                });
    }
}

注:Mac下使用idea开发,可用Alt+/看方法返回类型(本人个人习惯设置,写在此怕忘记)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值