案例需求:
1、对文本文件内的数字,取最大的前3个。
2、对每个班级内的学生成绩,取出前3名。(分组取topN)
实例:
Java版本:topN和分组TopN
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
* 取最大的前3个数字
* @author Administrator
*
*/
/** Reads integers (one per line) from a text file and prints the 3 largest. */
public class Top3 {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf()
				.setAppName("Top3")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// Each line of the input file is expected to hold a single integer.
		JavaRDD<String> lines = sc.textFile("C://Users//Administrator//Desktop//top.txt");

		// Key every line by its numeric value so sortByKey can order globally.
		JavaPairRDD<Integer, String> keyed =
				lines.mapToPair(s -> new Tuple2<Integer, String>(Integer.valueOf(s), s));

		// false => descending order, so the largest numbers come first.
		JavaPairRDD<Integer, String> descending = keyed.sortByKey(false);

		// Drop the string payload; only the numeric keys are needed.
		JavaRDD<Integer> numbers = descending.map(pair -> pair._1);

		// take(3) pulls just the first 3 elements to the driver.
		List<Integer> top3 = numbers.take(3);
		for (Integer n : top3) {
			System.out.println(n);
		}

		sc.close();
	}
}
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
* 分组取top3
* @author Administrator
*
*/
/**
 * Groups "className score" lines by class and prints the top 3 scores
 * of each class (null-padded when a class has fewer than 3 scores).
 */
public class GroupTop3 {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf()
				.setAppName("Top3")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// Input format: one "<className> <score>" pair per line.
		JavaRDD<String> lines = sc.textFile("C://Users//Administrator//Desktop//score.txt");

		JavaPairRDD<String, Integer> classToScore = lines.mapToPair(
				line -> {
					String[] parts = line.split(" ");
					return new Tuple2<String, Integer>(parts[0], Integer.valueOf(parts[1]));
				});

		// Gather all scores of each class into one iterable.
		JavaPairRDD<String, Iterable<Integer>> byClass = classToScore.groupByKey();

		// Reduce each class's scores to its 3 highest values.
		JavaPairRDD<String, Iterable<Integer>> top3PerClass = byClass.mapToPair(
				classScores -> {
					String className = classScores._1;

					// Materialize the scores and sort them highest-first.
					// Stable sort keeps encounter order among ties, matching
					// an incremental top-3 insertion.
					java.util.ArrayList<Integer> all = new java.util.ArrayList<Integer>();
					for (Integer score : classScores._2) {
						all.add(score);
					}
					java.util.Collections.sort(all, java.util.Collections.reverseOrder());

					// Slots stay null when the class has fewer than 3 scores.
					Integer[] best = new Integer[3];
					for (int k = 0; k < 3 && k < all.size(); k++) {
						best[k] = all.get(k);
					}
					return new Tuple2<String, Iterable<Integer>>(className, Arrays.asList(best));
				});

		top3PerClass.foreach(entry -> {
			System.out.println("class: " + entry._1);
			Iterator<Integer> it = entry._2.iterator();
			while (it.hasNext()) {
				System.out.println(it.next());
			}
			System.out.println("=======================================");
		});

		sc.close();
	}
}
Scala版本:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/** Reads integers (one per line) from a text file and prints the 3 largest. */
object Top3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Each line is expected to hold a single integer.
      val lines = sc.textFile("C://Users//Administrator//Desktop//top.txt", 1)
      // Key by numeric value; sortByKey(false) sorts descending.
      val pairs = lines.map { line => (line.toInt, line) }
      val sortedPairs = pairs.sortByKey(false)
      val sortedNumbers = sortedPairs.map(_._1)
      // take(3) pulls only the first 3 elements to the driver.
      val top3Number = sortedNumbers.take(3)
      top3Number.foreach(println)
    } finally {
      // Fix: the context was never stopped (the Java versions call sc.close()).
      sc.stop()
    }
  }
}
scala版本分组取topN:
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Groups "className score" lines by class and prints the top 5 scores
 * of each class.
 *
 * Fixes over the original hand-rolled insertion into a zero-initialized
 * Array[Int](5):
 *  - scores <= 0 are no longer silently dropped (0 was acting as a sentinel);
 *  - classes with fewer than 5 scores no longer print fake 0 scores;
 *  - dead `var i = 0` / `var j = 0` declarations removed;
 *  - each line is split once instead of twice;
 *  - the SparkContext is stopped on exit.
 */
object GroupTopN {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("groupTopN").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Sample data is in the comment at the end of the file.
      val baseRDD = sc.textFile("D:\\score.txt").cache()
      // Input format: one "<className> <score>" pair per line.
      val pairRDD = baseRDD.map { line =>
        val fields = line.split(" ")
        (fields(0), fields(1).toInt)
      }
      val groupRDD = pairRDD.groupByKey()
      // Debug output: show each class with all of its scores.
      groupRDD.foreach(println(_))
      // Sort each class's scores descending and keep the best 5.
      val grouptop5RDD = groupRDD.map { case (className, scores) =>
        (className, scores.toArray.sortWith(_ > _).take(5))
      }
      grouptop5RDD.foreach { case (className, top) =>
        println("================================")
        top.foreach(score => println(className + " : " + score))
      }
    } finally {
      sc.stop()
    }
  }
}
/*
class1 77
class3 67
class2 81
class3 73
class1 91
class2 69
class2 97
class3 90
class1 57
class3 55
class1 57
class2 81
class1 93
class3 79
class2 81
class1 99
class3 99
class1 91
class2 81
class1 99
class3 79
class2 81
*/