1. 导入依赖包
依赖配置与这篇文章相同:https://blog.csdn.net/weixin_44449054/article/details/114223002
2.代码实现
package cn.twy;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Word-count example written against Spark's Java RDD API.
 *
 * <p>Reads a text file, splits every line into whitespace-separated words, counts how often
 * each word occurs, and prints the resulting (word, count) pairs ordered by descending count.
 */
public class WordCountJava {

    /** Input file read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "H:\\大数据实时资料\\wordcount.txt";

    /**
     * Builds the Spark context, runs the word-count pipeline and prints the collected result.
     *
     * @param args optional; when present, {@code args[0]} is used as the input file path
     *             instead of {@link #DEFAULT_INPUT_PATH}
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Application name and master are fixed here; the master is "local[2]".
        SparkConf sparkConf = new SparkConf().setAppName("WordCount_Java").setMaster("local[2]");

        // try-with-resources closes the context even when the job throws
        // (for example when the input file is missing), so it is never leaked.
        try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) {
            JavaRDD<String> lines = sc.textFile(inputPath);
            JavaPairRDD<String, Integer> counts = countWords(splitIntoWords(lines));
            // collect() pulls the whole result to the driver before printing.
            System.out.println(sortByCountDescending(counts).collect());
        }
    }

    /** Splits every line into its non-empty, whitespace-separated words. */
    private static JavaRDD<String> splitIntoWords(JavaRDD<String> lines) {
        return lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) {
                // Trimming first guarantees split() cannot produce a leading empty token;
                // a blank line would otherwise yield a single "" that gets counted as a word.
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    return Arrays.<String>asList().iterator();
                }
                // "\\s+" treats runs of spaces/tabs as one separator.
                return Arrays.asList(trimmed.split("\\s+")).iterator();
            }
        });
    }

    /** Pairs every word with 1 and sums the ones per distinct word. */
    private static JavaPairRDD<String, Integer> countWords(JavaRDD<String> words) {
        JavaPairRDD<String, Integer> wordAndOne =
                words.mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String word) {
                        return new Tuple2<String, Integer>(word, 1);
                    }
                });

        return wordAndOne.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer left, Integer right) {
                return left + right;
            }
        });
    }

    /** Orders (word, count) pairs by count, highest first. */
    private static JavaPairRDD<String, Integer> sortByCountDescending(
            JavaPairRDD<String, Integer> counts) {
        // sortByKey only sorts on the key, so the count is moved into the key position,
        // the pairs are sorted in descending order, and then swapped back to (word, count).
        JavaPairRDD<Integer, String> countAndWord =
                counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                    @Override
                    public Tuple2<Integer, String> call(Tuple2<String, Integer> wordAndCount) {
                        return wordAndCount.swap();
                    }
                });

        return countAndWord.sortByKey(false)
                .mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(Tuple2<Integer, String> entry) {
                        return entry.swap();
                    }
                });
    }
}