






package cn.spark.study.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class SortWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortWordCount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\spark.txt");
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

            private static final long serialVersionUID = 1L;

            public Iterable<String> call(String t) throws Exception {
                return Arrays.asList(t.split(" "));
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
        // 到这里为止,就得到了每个单词出现的次数
        // 但是,问题是,我们的新需求,是要按照每个单词出现次数的顺序,降序排序
        // wordCounts RDD内的元素是什么?应该是这种格式的吧:(hello, 3) (you, 2)
        // 我们需要将RDD转换成(3, hello) (2, you)的这种格式,才能根据单词出现次数进行排序把!
        // 进行key-value的反转映射
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String,Integer>, Integer, String>() {

            private static final long serialVersionUID = 1L;

            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(new PairFunction<Tuple2<Integer,String>, String, Integer>() {

            private static final long serialVersionUID = 1L;

            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);

        // 到此为止,我们获得了按照单词出现次数排序后的单词计数
        // 打印出来
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String,Integer>>() {

            private static final long serialVersionUID = 1L;

            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + " appears " + t._2 + " times.");



package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * @author Administrator
 */
/** Counts the words of a text file and prints them by descending frequency. */
object SortWordCount {

  /**
   * Runs the word count, orders the result by count (descending) and prints
   * one line per word.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): app name and local master added to match the other drivers
    // in this file; the original built a bare SparkConf. Confirm this job is not
    // meant to receive its master from spark-submit.
    val conf = new SparkConf()
      .setAppName("SortWordCount")
      .setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("D:\\test-file\\spark.txt", 1)
      val words = lines.flatMap { line => line.split(" ") }
      val pairs = words.map { word => (word, 1) }
      val wordCounts = pairs.reduceByKey(_ + _)

      // sortByKey orders on the key, so flip to (count, word), sort descending,
      // then flip back to (word, count).
      val countWords = wordCounts.map(wordCount => (wordCount._2, wordCount._1))
      val sortedCountWords = countWords.sortByKey(false)
      val sortedWordCounts = sortedCountWords.map(sortedCountWord => (sortedCountWord._2, sortedCountWord._1))

      sortedWordCounts.foreach(sortedWordCount => println(
        sortedWordCount._1 + " appear " + sortedWordCount._2 + " times."))
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }
}







package cn.spark.study.core;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom key for secondary sorting.
 * @author Administrator
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

    private static final long serialVersionUID = -2366006422945129991L;
    // 首先在自定义key里面,定义需要进行排序的列
    private int first;
    private int second;
    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;

    public boolean $greater(SecondarySortKey other) {
        if(this.first > other.getFirst()) {
            return true;
        } else if(this.first == other.getFirst() && 
                this.second > other.getSecond()) {
            return true;
        return false;
    public boolean $greater$eq(SecondarySortKey other) {
        if(this.$greater(other)) {
            return true;
        } else if(this.first == other.getFirst() && 
                this.second == other.getSecond()) {
            return true;
        return false;

    public boolean $less(SecondarySortKey other) {
        if(this.first < other.getFirst()) {
            return true;
        } else if(this.first == other.getFirst() && 
                this.second < other.getSecond()) {
            return true;
        return false;
    public boolean $less$eq(SecondarySortKey other) {
        if(this.$less(other)) {
            return true;
        } else if(this.first == other.getFirst() && 
                this.second == other.getSecond()) {
            return true;
        return false;
    public int compare(SecondarySortKey other) {
        if(this.first - other.getFirst() != 0) {
            return this.first - other.getFirst();
        } else {
            return this.second - other.getSecond();
    public int compareTo(SecondarySortKey other) {
        if(this.first - other.getFirst() != 0) {
            return this.first - other.getFirst();
        } else {
            return this.second - other.getSecond();
    // 为要进行排序的多个列,提供getter和setter方法,以及hashcode和equals方法
    public int getFirst() {
        return first;

    public void setFirst(int first) {
        this.first = first;

    public int getSecond() {
        return second;

    public void setSecond(int second) {
        this.second = second;

    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;

    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        SecondarySortKey other = (SecondarySortKey) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;


package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

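/**
 * Secondary sort.
 * 1. Implement a custom key that implements the Ordered and Serializable interfaces and defines its own ordering over several columns.
 * 2. Map the RDD of text lines to a JavaPairRDD whose key is the custom key and whose value is the text line.
 * 3. Sort by the custom key with the sortByKey operator.
 * 4. Map again to drop the custom key and keep only the text lines.
 * @author Administrator
 */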
public class SecondarySort {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\sort.txt");
        JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(
                new PairFunction<String, SecondarySortKey, String>() {

                    private static final long serialVersionUID = 1L;

                    public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                        String[] lineSplited = line.split(" ");  
                        SecondarySortKey key = new SecondarySortKey(
                        return new Tuple2<SecondarySortKey, String>(key, line);
        JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();
        JavaRDD<String> sortedLines = sortedPairs.map(
                new Function<Tuple2<SecondarySortKey,String>, String>() {

                    private static final long serialVersionUID = 1L;

                    public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                        return v1._2;
        sortedLines.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            public void call(String t) throws Exception {



package cn.spark.study.core

/**
 * @author Administrator
 */
/**
 * Composite sort key: orders by `first`, and by `second` when the `first`
 * values are equal.
 *
 * @param first  primary sort column
 * @param second secondary sort column, consulted only on equal `first`
 */
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {

  /**
   * Compares by `first`, then by `second`.
   *
   * Uses `Integer.compare` rather than subtraction: a difference of two Ints
   * overflows when the operands have opposite signs and large magnitude, which
   * would flip the sign of the result.
   *
   * @param that key to compare against
   * @return negative, zero or positive when this key is less than, equal to or
   *         greater than `that`
   */
  def compare(that: SecondSortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) {
      byFirst
    } else {
      Integer.compare(this.second, that.second)
    }
  }
}


package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * @author Administrator
 */
/**
 * Sorts the lines of a text file by their first two space-separated integer
 * columns, using `SecondSortKey` as the sort key.
 */
object SecondSort {

  /**
   * Reads the input file, keys each line by its first two columns, sorts by
   * that key and prints the lines in sorted order.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): app name and local master added to match the other drivers
    // in this file; the original built a bare SparkConf.
    val conf = new SparkConf()
      .setAppName("SecondSort")
      .setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("D:\\test-file\\sort.txt", 1)

      // Key every line by (column 0, column 1); split once and reuse the fields.
      val pairs = lines.map { line =>
        val fields = line.split(" ")
        (new SecondSortKey(fields(0).toInt, fields(1).toInt), line)
      }

      val sortedPairs = pairs.sortByKey()
      // Drop the key again; only the original text line is kept.
      val sortedLines = sortedPairs.map(sortedPair => sortedPair._2)
      sortedLines.foreach { sortedLine => println(sortedLine) }
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }
}








package cn.spark.study.core;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class Top3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3Java").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\top.txt"); 

        JavaPairRDD<Integer, String> pairs = lines.mapToPair(new PairFunction<String, Integer, String>() {

            private static final long serialVersionUID = 1L;

            public Tuple2<Integer, String> call(String t) throws Exception {
                return new Tuple2<Integer, String>(Integer.valueOf(t), t);
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);
        JavaRDD<Integer> sortedNumbers = sortedPairs.map(new Function<Tuple2<Integer,String>, Integer>() {

            private static final long serialVersionUID = 1L;

            public Integer call(Tuple2<Integer, String> v1) throws Exception {
                return v1._1;
        List<Integer> sortedNumberList = sortedNumbers.take(3);
        for(Integer num : sortedNumberList) {


package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * @author Administrator
 */
/**
 * Prints the three largest integers found in a text file containing one
 * integer per line.
 */
object Top3 {

  /**
   * Parses every line as an integer, sorts descending and prints the first
   * three values.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): app name and local master added to match the other drivers
    // in this file; the original built a bare SparkConf.
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("D:\\test-file\\top.txt", 1)
      // Key every line by its numeric value so sortByKey can order it.
      val pairs = lines.map { line => (line.toInt, line) }
      val sortedPairs = pairs.sortByKey(false) // false = descending
      val sortedNumbers = sortedPairs.map(sortedPair => sortedPair._1)
      val top3Number = sortedNumbers.take(3)
      for (num <- top3Number) {
        // NOTE(review): loop body was missing in the source; printing each value
        // is the assumed intent.
        println(num)
      }
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }
}



package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Top 3 per group.
 * @author Administrator
 */
public class GroupTop3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\score.txt");
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                new PairFunction<String, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    public Tuple2<String, Integer> call(String line) throws Exception {
                        String[] lineSplited = line.split(" ");  
                        return new Tuple2<String, Integer>(lineSplited[0], 
        JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();
        JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(
                new PairFunction<Tuple2<String,Iterable<Integer>>, String, Iterable<Integer>>() {

                    private static final long serialVersionUID = 1L;

                    public Tuple2<String, Iterable<Integer>> call(
                            Tuple2<String, Iterable<Integer>> classScores)
                            throws Exception {
                        Integer[] top3 = new Integer[3];
                        String className = classScores._1;
                        Iterator<Integer> scores = classScores._2.iterator();
                        while(scores.hasNext()) {
                            Integer score = scores.next();
                            for(int i = 0; i < 3; i++) {
                                if(top3[i] == null) {
                                    top3[i] = score;
                                } else if(score > top3[i]) {
                                    for(int j = 2; j > i; j--) {
                                        top3[j] = top3[j - 1];  
                                    top3[i] = score;
                        return new Tuple2<String, 
                                Iterable<Integer>>(className, Arrays.asList(top3));    
        top3Score.foreach(new VoidFunction<Tuple2<String,Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;

            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println("class: " + t._1);  
                Iterator<Integer> scoreIterator = t._2.iterator();
                while(scoreIterator.hasNext()) {
                    Integer score = scoreIterator.next();


package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Reads "class score" lines and prints, for every class, its three highest
 * scores in descending order.
 */
object GroupTop3 {

  /**
   * Groups the scores by class, keeps the top three of each group and prints
   * one line per class. A class with fewer than three scores prints only the
   * scores it has.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GroupTop3Scala").setMaster("local")
    val context = new SparkContext(conf)
    try {
      val linesRDD = context.textFile("D:\\test-file\\score.txt")

      // "class score" -> (class, score); split once and reuse the fields.
      val studentScores = linesRDD.map { line =>
        val fields = line.split(" ")
        (fields(0), fields(1).toInt)
      }

      // Sort each group descending and keep the first three. This replaces a
      // hand-rolled insertion into a zero-initialised Array[Int] whose
      // "empty slot" test (== Nil) could never be true for an Int.
      val topScoresByClass = studentScores.groupByKey().map { case (className, scores) =>
        (className, scores.toArray.sorted(Ordering[Int].reverse).take(3))
      }

      topScoresByClass.foreach { case (className, topScores) =>
        // NOTE(review): the visible original used print() with no line break;
        // println is used so each class ends up on its own line. Confirm.
        println(className + "班级前三明成绩为:" + topScores.mkString(","))
      }
    } finally {
      // Release the context even when a stage fails.
      context.stop()
    }
  }
}






