A sort-based WordCount program
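
The approach: run an ordinary WordCount, swap each (word, count) pair to (count, word), sort with sortByKey in descending order, then swap back. For example, given a hypothetical data/sparkcore/wordcount.txt containing comma-separated words:

spark,hadoop,spark
hive,spark,hadoop

the program would print:

(spark,3)
(hadoop,2)
(hive,1)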

Java code:

package com.netcloud.spark.sparkcore.projectpractice;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * Demo_000_SortWordCount
 * Sorts the word-count results in descending order by count
 * @author yangshaojun
 * #date  2019/3/13 16:37
 * @version 1.0
 */
public class Demo_000_SortWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("Demo_000_SortWordCount");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lineRDD = sc.textFile("data/sparkcore/wordcount.txt");

        /**
         * In Spark 2.x, flatMap expects an Iterator where Spark 1.x used an
         * Iterable; accordingly, a function that builds a List must return
         * list.iterator().
         */
        JavaRDD<String> wordsRDD = lineRDD.flatMap(new FlatMapFunction<String, String>() {

            @Override
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split(",")).iterator();
            }
        });

        /**
         * In the Java API, mapToPair plays the role of Scala's map here:
         * map each word to the pair (word, 1).
         */
        JavaPairRDD<String, Integer> wordTuple2RDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {

            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        /**
         * Aggregate the counts per word with reduceByKey.
         */
        JavaPairRDD<String, Integer> retValueRDD = wordTuple2RDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // At this point the words in the input file have been counted;
        // next, sort the counts in descending order.
        /**
         * Use mapToPair to swap the <K,V> pairs to <V,K> so that sortByKey
         * can sort by count.
         */
        JavaPairRDD<Integer, String> exchageKVRDD = retValueRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {

            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });
        /**
         * Sort by key (the count) in descending order.
         */
        JavaPairRDD<Integer, String> sortByKeyRDD = exchageKVRDD.sortByKey(false);
        /**
         * Swap the <Integer,String> tuples back to <String,Integer>.
         */
        JavaPairRDD<String, Integer> retSortRDD = sortByKeyRDD.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);
            }
        });
        /**
         * Iterate over the final result and print it.
         */
        retSortRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> tuple2) throws Exception {
                System.out.println(tuple2);
            }
        });
    }
}
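
Since the code already targets Spark 2.x (the flatMap above returns an Iterator), the anonymous inner classes can be replaced with Java 8 lambdas. A minimal sketch of the same pipeline, assuming the same input path (variable names here are illustrative):

JavaRDD<String> lines = sc.textFile("data/sparkcore/wordcount.txt");
lines.flatMap(s -> Arrays.asList(s.split(",")).iterator()) // split each line into words
        .mapToPair(w -> new Tuple2<>(w, 1))                // (word, 1)
        .reduceByKey(Integer::sum)                         // (word, count)
        .mapToPair(t -> new Tuple2<>(t._2, t._1))          // swap to (count, word)
        .sortByKey(false)                                  // sort by count, descending
        .mapToPair(t -> new Tuple2<>(t._2, t._1))          // swap back to (word, count)
        .foreach(t -> System.out.println(t));

One caveat for both versions: foreach runs on the executors, so on a real cluster the tuples appear in the executor logs rather than on the driver console; the local master used here is why the output shows up in the terminal. For driver-side printing of a small result, collect it first, e.g. retSortRDD.collect(), and print in a local loop.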

Scala code:

package com.netcloud.bigdata.spark_core.basiclearning.projectpractice

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo_000_SortWordCount
  * Sorts the word-count results in descending order by count
  * @author yangshaojun
  * #date  2019/3/13 17:32
  * @version 1.0
  */
object Demo_000_SortWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Demo_000_SortWordCount")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("data/sparkcore/wordcount.txt")
    val wordsRDD = linesRDD.flatMap(line => line.split(","))  // split each line into words
    val wordRDD = wordsRDD.map(word => (word, 1))             // map each word to (word, 1)
    val retRDD = wordRDD.reduceByKey(_ + _)                   // aggregate counts per word
    val exchangeRDD = retRDD.map(t => (t._2, t._1))           // swap to (count, word)
    val sortRDD = exchangeRDD.sortByKey(false)                // sort by count, descending
    val normalRDD = sortRDD.map(t => (t._2, t._1))            // swap back to (word, count)
    normalRDD.foreach(println)                                // print the result
  }

}
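
Two notes on the Scala version. As in Java, foreach(println) prints on the executors in cluster mode; normalRDD.collect().foreach(println) guarantees ordered, driver-side output for a small result. Also, the swap-sort-swap sequence can be collapsed into a single step with RDD.sortBy, e.g. retRDD.sortBy(_._2, ascending = false), which sorts the (word, count) pairs directly by count.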