1. Requirements
(1) Count the number of occurrences of every word in a text file.
(2) Sort the words in descending order by occurrence count.
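For example, given a spark.txt whose content is the single line "hello you hello world you hello" (a hypothetical input), the programs below should print:

hello appears 3 times
you appears 2 times
world appears 1 times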
2. Hands-on implementation
- Java version
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Sorted word count program.
 */
public class SortWordCount {
    public static void main(String[] args) {
        // Create the SparkConf and JavaSparkContext
        SparkConf conf = new SparkConf().setAppName("SortWordCount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Create the lines RDD
        JavaRDD<String> lines = sc.textFile("./spark.txt");
        // First perform the plain word count from before, i.e. without sorting
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String t) throws Exception {
                return Arrays.asList(t.split(" ")).iterator();
            }
        });
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // At this point we have each word's occurrence count.
        // The new requirement, however, is to sort by occurrence count in descending order.
        // Elements of the wordCounts RDD look like: (hello,3) (you,2)
        // To sort by occurrence count, the RDD has to be transformed into (3,hello) (2,you),
        // so swap each key-value pair
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });
        // Sort by key (false = descending order)
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);
        // Swap value and key back again
        JavaPairRDD<String, Integer> sortedWordsCount = sortedCountWords.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);
            }
        });
        // At this point we have the word counts sorted by occurrence count
        // Print the results
        sortedWordsCount.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + " appears " + t._2 + " times");
            }
        });
        // Close the JavaSparkContext
        sc.close();
    }
}
- Scala version
import org.apache.spark.{SparkConf, SparkContext}

object SortWordCount_scala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SortWordCount_scala").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("./spark.txt")
    // The plain word count, not yet sorted
    val words = lines.flatMap(line => line.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    // Swap key and value, sort by key in descending order, then swap back
    val countWords = wordCounts.map(wordCount => (wordCount._2, wordCount._1))
    val sortedCountWords = countWords.sortByKey(false)
    val sortedWordsCount = sortedCountWords.map(sortedCountWord => (sortedCountWord._2, sortedCountWord._1))
    // Print the results
    sortedWordsCount.foreach(wordCount => println(wordCount._1 + " appears " + wordCount._2 + " times"))
    sc.stop()
  }
}
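As an aside, the double key-value swap is not strictly necessary: the Scala RDD API also offers sortBy, which sorts by an arbitrary key function. A minimal sketch, reusing the wordCounts RDD from above:

// Sort the (word, count) pairs by the count field, descending,
// without swapping key and value back and forth
val sorted = wordCounts.sortBy(_._2, ascending = false)
sorted.foreach(wc => println(wc._1 + " appears " + wc._2 + " times"))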
These are study notes for the Beifeng (北风网) Spark 2.0 training videos.
Video link:
https://www.bilibili.com/video/av19995678?p=53