Spark basic sorting
- Classic wordcount sort: word counts in descending order (an alternative using sortBy is sketched after the Scala version below)
Java version: BasicSort
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class BasicSort {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(BasicSort.class.getSimpleName()).setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> linesRDD = sc.textFile("E:/test/word.txt");
        // Split each line into words (note: with Spark 2.x, flatMap expects an Iterator, so append .iterator()).
        JavaRDD<String> wordsRDD = linesRDD.flatMap(x -> Arrays.asList(x.split(" ")));
        JavaPairRDD<String, Integer> pairsRDD = wordsRDD.mapToPair(x -> new Tuple2<String, Integer>(x, 1));
        JavaPairRDD<String, Integer> rwordsRDD = pairsRDD.reduceByKey((v1, v2) -> v1 + v2);
        // Swap (word, count) to (count, word), sort by count descending, then swap back.
        List<Tuple2<String, Integer>> collect = rwordsRDD.mapToPair(x -> new Tuple2<Integer, String>(x._2, x._1))
                .sortByKey(false)
                .map(x -> new Tuple2<String, Integer>(x._2, x._1))
                .collect();
        for (Tuple2<String, Integer> x : collect) {
            System.out.println(x._1() + "---->" + x._2());
        }
        sc.close();
    }
}
Scala version: BasicSort
import org.apache.spark.{SparkConf, SparkContext}

object wordcount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)
    val wcRDD = sc.textFile("E:/test/word.txt").flatMap(_.split(" "))
      .map((_, 1)).reduceByKey(_ + _)
    // Swap to (count, word), sort descending by count, then swap back to (word, count).
    val collect = wcRDD.map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1)).collect()
    collect.foreach(x => println(x))
    sc.stop()
  }
}
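The swap, sortByKey, swap-back pattern above is the classic approach. As referenced above, here is a minimal alternative sketch (the object name BasicSortBySketch is illustrative, not from the original): RDD.sortBy takes a key-extraction function, so the pairs can be sorted by count directly without swapping the tuple twice.
import org.apache.spark.{SparkConf, SparkContext}

object BasicSortBySketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("BasicSortBySketch").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // sortBy extracts the count from each (word, count) pair and sorts on it descending,
    // so the two extra map passes that swap the tuple are not needed.
    sc.textFile("E:/test/word.txt")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
      .collect()
      .foreach(println)
    sc.stop()
  }
}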
- Spark secondary sort
Java version: Spark secondary sort
import java.io.Serializable;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SecondSortApp {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(SecondSortApp.class.getSimpleName()).setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("E:/test/sort.txt");
        // Wrap each line in a composite SecondSort key so sortByKey orders by the first column, then the second.
        List<Tuple2<SecondSort, String>> collect = lines
                .mapToPair(line -> new Tuple2<SecondSort, String>(new SecondSort(line.split(" ")[0], line.split(" ")[1]), line))
                .sortByKey().collect();
        for (Tuple2<SecondSort, String> t : collect) {
            System.out.println(t._2());
        }
        sc.close();
    }
}
class SecondSort implements Comparable<SecondSort>, Serializable {
    private int first;
    private int second;

    public SecondSort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public SecondSort(String first, String second) {
        this.first = Integer.valueOf(first.trim());
        this.second = Integer.valueOf(second.trim());
    }

    public SecondSort() {
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int compareTo(SecondSort o) {
        // Order by the first field ascending; ties are broken by the second field descending.
        int ret = first - o.first;
        if (ret == 0) {
            ret = o.second - second;
        }
        return ret;
    }
}
Scala version: secondary sort
import org.apache.spark.{SparkConf, SparkContext}

object SecondSortAPP {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortAPP").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("E:/test/sort.txt")
    // Use the composite SecondSort key so sortByKey orders by the first column, then the second.
    val collect = linesRDD.map(line => (new SecondSort(line.split(" ")(0), line.split(" ")(1)), line))
      .sortByKey().collect()
    collect.foreach(x => println(x._2))
    sc.stop()
  }
}
class SecondSort(val first: String, val second: String) extends Ordered[SecondSort] with Serializable {
  def getFirst() = first
  def getSecond() = second

  override def compare(that: SecondSort): Int = {
    // Compare numerically (as the Java version does), so that "10" sorts after "2";
    // a plain string comparison would order these the wrong way.
    var ret = first.trim.toInt.compareTo(that.first.trim.toInt)
    if (ret == 0) {
      ret = second.trim.toInt.compareTo(that.second.trim.toInt)
    }
    ret
  }
}
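If a dedicated key class is not required, the same two-column ordering can also be expressed with sortBy and a tuple key, relying on the implicit element-by-element Ordering of tuples. A minimal sketch, assuming sort.txt holds two space-separated numeric columns as the constructors above imply (the object name SecondSortBySketch is illustrative):
import org.apache.spark.{SparkConf, SparkContext}

object SecondSortBySketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortBySketch").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Tuples are ordered element by element, so (col1, col2) sorts ascending on the
    // first column and breaks ties on the second, without a custom key class.
    sc.textFile("E:/test/sort.txt")
      .sortBy(line => {
        val cols = line.split(" ")
        (cols(0).trim.toInt, cols(1).trim.toInt)
      })
      .collect()
      .foreach(println)
    sc.stop()
  }
}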
- Spark topN
Java version: Spark topN
import java.io.Serializable;
import java.util.Comparator;
import java.util.TreeSet;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class TopN {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(TopN.class.getName()).setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> linesRDD = sc.textFile("E:/test/topn.txt");
        int topn = Integer.valueOf(args[0]);
        // Broadcast the limit so every task reads the same value.
        Broadcast<Integer> topN = sc.broadcast(topn);
        JavaPairRDD<String, Iterable<String>> result = linesRDD
                .mapToPair(line -> new Tuple2<String, String>(line.split(" ")[0], line.split(" ")[1]))
                .groupByKey().mapToPair(x -> {
                    TreeSet<String> set = new TreeSet<String>(new Mycomparator() {
                        @Override
                        public int compare(String o1, String o2) {
                            int ret = o1.compareTo(o2);
                            if (ret == 0) {
                                // Never return 0, so equal values are kept instead of being deduplicated.
                                ret = 1;
                            }
                            return ret;
                        }
                    });
                    // Keep at most topN values per key: once the set grows past the limit,
                    // pollLast() drops the largest element in the set's ordering.
                    for (String score : x._2()) {
                        set.add(score);
                        if (set.size() > topN.value()) {
                            set.pollLast();
                        }
                    }
                    return new Tuple2<String, Iterable<String>>(x._1, set);
                }).sortByKey();
        result.foreach(x -> System.out.println(x));
    }
}

// The anonymous comparator is shipped inside the task closure, so it must be serializable as well;
// this interface combines Comparator with Serializable for that purpose.
interface Mycomparator extends Comparator<String>, Serializable {}
Scala version: Spark topN
import scala.collection.mutable
import org.apache.spark.{SparkConf, SparkContext}

object TopN {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopN").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("E:/test/topn.txt")
    linesRDD.map(line => new Tuple2[String, String](line.split(" ")(0), line.split(" ")(1))).groupByKey()
      .sortByKey().map(x => MyTopN(3, x)).foreach(x => println(x))
    sc.stop()
  }

  def MyTopN(topn: Int, tuple: Tuple2[String, Iterable[String]]): Tuple2[String, Iterable[String]] = {
    // Note: Scala's TreeSet takes a custom comparator differently from Java's:
    // Java passes the Comparator in the constructor call, while Scala's TreeSet
    // takes the Ordering in a separate (second) parameter list.
    var set = mutable.TreeSet[String]()(new Ordering[String]() {
      override def compare(x: String, y: String): Int = {
        var ret = x.compareTo(y)
        if (ret == 0) {
          // Never return 0, so duplicate values are kept.
          ret = 1
        }
        ret
      }
    })
    for (s <- tuple._2) {
      set += s
      if (set.size > topn) {
        // Keep only the first topn elements in the set's ordering.
        set = set.take(topn)
      }
    }
    new Tuple2[String, mutable.Iterable[String]](tuple._1, set)
  }
}
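groupByKey materializes every value of a key at once, which can be heavy for skewed keys. A minimal sketch of the same per-key topN using aggregateByKey, which trims each key's set on the map side before shuffling (the object name and the hard-coded limit of 3 are illustrative; note that, unlike the ret = 1 trick above, a plain TreeSet deduplicates equal values):
import scala.collection.mutable
import org.apache.spark.{SparkConf, SparkContext}

object TopNAggregateSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopNAggregateSketch").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val topn = 3
    sc.textFile("E:/test/topn.txt")
      .map(line => (line.split(" ")(0), line.split(" ")(1)))
      .aggregateByKey(mutable.TreeSet.empty[String])(
        // Fold one value into a partition-local set, keeping at most topn elements.
        (set, v) => { set += v; if (set.size > topn) set.take(topn) else set },
        // Merge the sets built on different partitions, again keeping at most topn.
        (s1, s2) => { s1 ++= s2; if (s1.size > topn) s1.take(topn) else s1 }
      )
      .sortByKey()
      .collect()
      .foreach(println)
    sc.stop()
  }
}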