Prologue
Word count with Spark
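Both examples call SparkUtils.getContext(), a helper that is not shown here. A minimal sketch of what such a helper might look like, assuming a cached local-mode context (the application name and master URL below are assumptions, not taken from the original):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkUtils {

    private static JavaSparkContext sc;

    /**
     * Returns a shared JavaSparkContext, creating it on first use.
     */
    public static synchronized JavaSparkContext getContext() {
        if (sc == null) {
            SparkConf conf = new SparkConf()
                    .setAppName("word-count")   // assumed application name
                    .setMaster("local[*]");     // assumption: local mode, all cores
            sc = new JavaSparkContext(conf);
        }
        return sc;
    }
}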
/**
 * Word count with flatMap, mapToPair and reduceByKey.
 *
 * @throws IOException
 */
public static void wc() throws IOException {
    JavaSparkContext sc = SparkUtils.getContext();
    // Two input lines; each line is split into words
    JavaRDD<String> inputRDD = sc.parallelize(Arrays.asList("hello word", "hello spark"));
    JavaRDD<String> words = inputRDD.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
    // Map each word to a (word, 1) pair, then sum the counts per word
    JavaPairRDD<String, Integer> counts = words
            .mapToPair(w -> new Tuple2<String, Integer>(w, 1))
            .reduceByKey((x, y) -> x + y);
    System.out.println(counts.collect());
    // [(spark,1), (word,1), (hello,2)]
}
The approach below is faster: instead of building the pair tuples yourself, call countByValue() directly on the RDD of words produced by flatMap. countByValue() is an action that counts each distinct element and returns the result to the driver as a Map.
/**
 * Word count with countByValue.
 *
 * @throws IOException
 */
public static void wc2() throws IOException {
    JavaSparkContext sc = SparkUtils.getContext();
    JavaRDD<String> inputRDD = sc.parallelize(Arrays.asList("hello word", "hello spark"));
    // Split each line into words, then count how often each word occurs
    Map<String, Long> wordCounts = inputRDD
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .countByValue();
    System.out.println(wordCounts);
    // {spark=1, word=1, hello=2}
}
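Note that both collect() and countByValue() are actions that bring the full result back to the driver, so they are only appropriate when the number of distinct words is small. For a large vocabulary, keep the counts in an RDD (for example the reduceByKey result above) and write them out with saveAsTextFile() instead.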