Implementing Common MapReduce Functionality with Spark (Java Version)


This post records some exercises that use Spark Core operators to implement common MapReduce tasks. Spark Core has two kinds of operators, Transformations and Actions. A Transformation describes an intermediate conversion step and does not actually execute anything; only an Action triggers the computation, so every pipeline must end with an Action operator.

Transformation operators:
map, filter, flatMap, groupByKey, reduceByKey, sortByKey, cogroup.
Action operators:
reduce(), collect(), count(), take(), saveAsTextFile(), countByKey().
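
For instance, nothing below actually runs until count() is called. A minimal sketch of the lazy behavior (sc is a JavaSparkContext like the one created in section 0 below; the numeric data is purely illustrative):

JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3));
// map is a Transformation: Spark only records the lineage, no job runs yet
JavaRDD<Integer> doubled = nums.map(x -> x * 2);
// count is an Action: only now is the pipeline actually executed
long n = doubled.count();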

0. Shared helper

A JavaSparkContext is needed to turn plain Java collections into Spark RDDs before any Spark operators can process them.

public static JavaSparkContext getSC(){
        // Run locally in a single process; change setMaster for a real cluster.
        SparkConf sparkConf = new SparkConf().setAppName("transformation").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        return sc;
    }
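
For reference, the snippets below assume roughly the following imports (the exact set can vary with the Spark version; this targets the Spark 2.x Java API):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

import java.io.Serializable;
import java.util.*;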

1. Word Count

public static void wordCount(){

        // Build the data set:
        List<String> data = Arrays.asList("Google Bye GoodBye Hadoop code", "Java code Bye");

        // Turn the data into an RDD
        JavaSparkContext sc = getSC();
        JavaRDD<String> lines = sc.parallelize(data);

        // Transformation logic:
        // split each line into words: "Google", "Bye"...
        // then map each word to a ("Google", 1) key-value pair
        // finally merge the counts by key

        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        JavaPairRDD<String, Integer> word = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String w) throws Exception {
                return new Tuple2<>(w, 1);
            }
        });

        JavaPairRDD<String, Integer> wordCnt = word.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        wordCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> o) throws Exception {
                System.out.println(o._1 + ":" + o._2);
            }
        });
    }

/* Output:
Bye:2
Google:1
Java:1
code:2
GoodBye:1
Hadoop:1
*/
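
With Java 8 lambdas the same pipeline can be written far more compactly. A sketch that should be equivalent on Spark 2.x (where FlatMapFunction returns an Iterator), reusing the same sc and data as above:

JavaPairRDD<String, Integer> counts = sc.parallelize(data)
        .flatMap(line -> Arrays.asList(line.split(" ")).iterator())  // line -> words
        .mapToPair(w -> new Tuple2<>(w, 1))                          // word -> (word, 1)
        .reduceByKey((a, b) -> a + b);                               // sum counts per word
counts.foreach(t -> System.out.println(t._1 + ":" + t._2));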

2. Inverted Index

The word is the key and the ids of the documents it appears in are the value, so we can look up which documents contain a given word.

public static void invertedIndex(){

        // Build the data: (document id, document content) pairs
        List<Tuple2<Integer, String>> data = Arrays.asList(new Tuple2<>(1, "This is the content of document 1 it is very short"),
                new Tuple2<>(2, "This is the content of document 2 it is very long bilabial"),
                new Tuple2<>(3, "This is the a document of 3 I love programming"));

        JavaSparkContext sc = getSC();
        JavaPairRDD<Integer, String> docStr = sc.parallelizePairs(data);

        // Transformation logic:
        // use flatMapToPair to turn the data into (word, document id) key-value pairs
        // use groupByKey to collect the ids per word, then sortByKey to order the words

        JavaPairRDD<String, Integer> strDocID = docStr.flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(Tuple2<Integer, String> doc) throws Exception {
                List<String> words = Arrays.asList(doc._2.split(" "));
                List<Tuple2<String, Integer>> wordDocID = new ArrayList<>();

                // Dedupe with a Map so each (word, id) pair is emitted at most once
                // per document; if there is a better way to strip duplicate values
                // per key, suggestions are welcome
                Map<String, Integer> myMap = new HashMap<>();

                for (String s : words) {
                    if(!myMap.containsKey(s)){
                        myMap.put(s, doc._1);
                    }
                }

                for (Map.Entry<String, Integer> entry : myMap.entrySet()) {
                    wordDocID.add(new Tuple2<>(entry.getKey(), entry.getValue()));
                }
                return wordDocID.iterator();
            }
        });

        JavaPairRDD<String, Iterable<Integer>> wordIDs = strDocID.groupByKey();

        JavaPairRDD<String, Iterable<Integer>> wordIDsSort = wordIDs.sortByKey(true);

        wordIDsSort.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> o) throws Exception {
                System.out.print(o._1 + ":");
                Iterator<Integer> it = o._2.iterator();
                while(it.hasNext()){
                    System.out.print(it.next() + ",");
                }
                System.out.println("");
            }
        });
    }
/* Output:
1:1,
2:2,
3:3,
I:3,
This:1,2,3,
a:3,
bilabial:2,
content:1,2,
document:1,2,3,
is:1,2,3,
it:1,2,
long:2,
love:3,
of:1,2,3,
programming:3,
short:1,
the:1,2,3,
very:1,2,
*/
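
On the dedup question raised in the code comment above: one simpler option is to emit every (word, id) pair and let Spark's distinct() drop the duplicates before grouping. A sketch (this shuffles a bit more data than deduplicating inside the mapper, but keeps the mapper trivial):

JavaPairRDD<String, Iterable<Integer>> wordIDs = docStr
        .flatMapToPair(doc -> {
            List<Tuple2<String, Integer>> out = new ArrayList<>();
            for (String w : doc._2.split(" ")) {
                out.add(new Tuple2<>(w, doc._1));     // (word, document id)
            }
            return out.iterator();
        })
        .distinct()      // removes duplicate (word, id) pairs across the cluster
        .groupByKey();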

3. N-Gram

An N-gram is a group of N consecutive items; here we slide a window of N characters over each string and count how many times every N-gram occurs.

public static void nGramSimple(){

        // Build the data:
        List<String> data = Arrays.asList("abcabc", "abcabc", "bbcabc");
        final int N = 3;

        JavaSparkContext sc = getSC();
        JavaRDD<String> nGramData = sc.parallelize(data);

        // Transformation logic:
        // emit (nGram, 1) pairs -> reduceByKey

        JavaPairRDD<String, Integer> nGram = nGramData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
                List<Tuple2<String, Integer>> pairList = new ArrayList<>();
                // slide a window of length N over the string
                for(int index = 0; index < str.length() - N + 1; ++index){
                    pairList.add(new Tuple2<>(str.substring(index, index + N), 1));
                }
                return pairList.iterator();
            }
        });

        JavaPairRDD<String, Integer> nGramCnt = nGram.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        nGramCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> o) throws Exception {
                System.out.println(o._1 + ":"  + o._2);
            }
        });
    }
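
Worked by hand, "abcabc" contributes abc, bca, cab, abc, so the counts should come out as follows (print order may vary across runs and partitions):

/* Output (order may vary):
abc:5
bca:3
cab:3
bbc:1
*/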

4. Top K Most Frequent Words

public static void topKFrequentWords(){

        List<String> data = Arrays.asList("a b c d a a a a", "b b f f e e c b b b", "g h i j k f f f");
        final int N = 3;   // N here is the K of "top K": how many words to keep

        JavaSparkContext sc = getSC();

        // Transformation logic:
        // first map to (word, 1) key-value pairs, then reduceByKey
        // then use mapPartitions to maintain a size-K min-heap inside each partition
        // finally collect the heap elements into one small list, scan it while
        // maintaining one more size-K min-heap; that heap ends up holding the top K frequent words

        JavaRDD<String> topKData = sc.parallelize(data);

        JavaPairRDD<String, Integer> word = topKData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
                List<String> words = Arrays.asList(str.split(" "));
                List<Tuple2<String, Integer>> wordPair = new ArrayList<>();
                for (String s : words) {
                    wordPair.add(new Tuple2<>(s, 1));
                }
                return wordPair.iterator();
            }
        });

        JavaPairRDD<String, Integer> wordCnt = word.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        class TopKKey implements Ordered<TopKKey>, Serializable{

            private String word;
            private Integer cnt;

            public void setWord(String word) {
                this.word = word;
            }

            public void setCnt(Integer cnt) {
                this.cnt = cnt;
            }

            public String getWord() {
                return word;
            }

            public Integer getCnt() {
                return cnt;
            }

            public TopKKey(String word, int cnt) {
                this.word = word;
                this.cnt = cnt;
            }

            @Override
            public int compare(TopKKey that) {
                return this.getCnt().compareTo(that.getCnt());
            }
            @Override
            public int compareTo(TopKKey that) {
                return this.getCnt().compareTo(that.getCnt());
            }

            // Ordered's comparison operators must agree with compareTo;
            // returning a constant false would break any code relying on them.
            @Override
            public boolean $less(TopKKey that) {
                return this.compareTo(that) < 0;
            }

            @Override
            public boolean $greater(TopKKey that) {
                return this.compareTo(that) > 0;
            }

            @Override
            public boolean $less$eq(TopKKey that) {
                return this.compareTo(that) <= 0;
            }

            @Override
            public boolean $greater$eq(TopKKey that) {
                return this.compareTo(that) >= 0;
            }
        }


        JavaRDD<TopKKey> topKHeaps = wordCnt.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, TopKKey>() {
            @Override
            public Iterator<TopKKey> call(Iterator<Tuple2<String, Integer>> wordCount) throws Exception {
                // Min-heap of size N: the least frequent word sits at the head,
                // so it is evicted whenever a more frequent word shows up.
                PriorityQueue<TopKKey> Q = new PriorityQueue<>();
                while(wordCount.hasNext()){
                    Tuple2<String, Integer> t = wordCount.next();
                    TopKKey tk = new TopKKey(t._1, t._2);
                    if(Q.size() < N){
                        Q.add(tk);
                    }else{
                        TopKKey peek = Q.peek();
                        if(tk.compareTo(peek) > 0){
                            Q.poll();
                            Q.add(tk);
                        }
                    }
                }
                return new ArrayList<>(Q).iterator();
            }
        });

        List<TopKKey> topKValues = topKHeaps.collect();
        PriorityQueue<TopKKey> topKHeap = new PriorityQueue<>();

        for (TopKKey value : topKValues) {
            if(topKHeap.size() < N){
                topKHeap.add(value);
            }else{
                TopKKey peek = topKHeap.peek();
                if(value.compareTo(peek) > 0){
                    topKHeap.poll();
                    topKHeap.add(value);
                }
            }

        }

        // Note: iterating a PriorityQueue does not visit elements in sorted
        // order; polling does (least frequent of the top K first).
        while (!topKHeap.isEmpty()) {
            TopKKey topKKey = topKHeap.poll();
            System.out.println(topKKey.getWord() + ":" + topKKey.getCnt());
        }
    }
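
For comparison, Spark ships a built-in shortcut for this pattern: top() on the counted pairs with a comparator on the count field. A sketch reusing wordCnt and N from above (the comparator must be Serializable, hence the intersection cast):

List<Tuple2<String, Integer>> topK = wordCnt.top(N,
        (Comparator<Tuple2<String, Integer>> & Serializable)
                (a, b) -> a._2.compareTo(b._2));
for (Tuple2<String, Integer> t : topK) {
    System.out.println(t._1 + ":" + t._2);
}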

5. Secondary Sort

Sort by class name first and, for equal class names, by score; the custom key below carries both fields.

public class SecondSortJava {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("wordCountApp").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("class1 67","class2 89","class1 78",
                "class2 90","class1 99","class3 34","class3 89");

        JavaRDD<String> rdd = sc.parallelize(list);

        JavaPairRDD<SecondSortKey, String> beginSortValues = rdd.mapToPair(new PairFunction<String, SecondSortKey, String>() {
            @Override
            public Tuple2<SecondSortKey, String> call(String line) throws Exception {
                String first = line.split(" ")[0];
                int second = Integer.parseInt(line.split(" ")[1]);
                SecondSortKey secondSortKey = new SecondSortKey(first, second);
                return new Tuple2<>(secondSortKey, line);
            }
        });

        JavaPairRDD<SecondSortKey, String> sortValues = beginSortValues.sortByKey(false);

        sortValues.foreach(new VoidFunction<Tuple2<SecondSortKey, String>>(){
            @Override
            public void call(Tuple2<SecondSortKey, String> o) throws Exception {
                System.out.println(o._2);
            }
        });
    }

}

// ^ + I (IntelliJ shortcut): implement the interface's abstract methods
class SecondSortKey implements Ordered<SecondSortKey>, Serializable{

    private String first;
    private int second;

    public SecondSortKey(String first, int second) {
        this.first = first;
        this.second = second;
    }

    // ⌘N (IntelliJ shortcut): generate getters and setters
    public void setFirst(String first) {
        this.first = first;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    public String getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }

    @Override
    public int compareTo(SecondSortKey that) {
        int comp = this.getFirst().compareTo(that.getFirst());
        if(comp == 0){
            return Integer.valueOf(this.getSecond()).compareTo(that.getSecond());
        }
        return comp;
    }

    @Override
    public int compare(SecondSortKey that) {
        return this.compareTo(that);
    }

    // Ordered's comparison operators must agree with compareTo;
    // returning a constant false here would corrupt the sort order.
    @Override
    public boolean $less(SecondSortKey that) {
        return this.compareTo(that) < 0;
    }

    @Override
    public boolean $greater(SecondSortKey that) {
        return this.compareTo(that) > 0;
    }

    @Override
    public boolean $less$eq(SecondSortKey that) {
        return this.compareTo(that) <= 0;
    }

    @Override
    public boolean $greater$eq(SecondSortKey that) {
        return this.compareTo(that) >= 0;
    }
}
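
With sortByKey(false) the keys are ordered descending, first by class name and then by score within the class, so the expected output is:

/* Output:
class3 89
class3 34
class2 90
class2 89
class1 99
class1 78
class1 67
*/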