import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Disable output compression through the underlying Hadoop configuration.
        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.output.fileoutputformat.compress", "false");

        // Load the input file, split each line into words, and count each word.
        JavaRDD<String> input = sc.textFile(args[0]);
        JavaRDD<String> words = input.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairRDD<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((x, y) -> x + y);

        counts.saveAsTextFile(args[1]); // write the result; the output path is assumed to be the second argument
        sc.stop();
    }
}
Or, equivalently:
JavaPairRDD<String, Integer> flattenPairs = inputData.flatMapToPair(text ->
        Arrays.asList(text.split(" ")).stream()
              .map(word -> new Tuple2<String, Integer>(word, 1))
              .iterator());
JavaPairRDD<String, Integer> wordCountRDD = flattenPairs.reduceByKey((v1,v2) -> v1+v2);
wordCountRDD.saveAsTextFile("path_of_output_file");
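To sanity-check either variant without an input file or output directory, a minimal local sketch like the following can be used; the class name WordCountLocalCheck and the sample sentences are illustrative assumptions, not part of the original example.

import java.util.Arrays;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCountLocalCheck {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount-check").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Parallelize a small in-memory list instead of reading a file.
            JavaRDD<String> lines = sc.parallelize(
                    Arrays.asList("spark makes word count easy", "word count with spark"));
            JavaPairRDD<String, Integer> pairs = lines.flatMapToPair(text ->
                    Arrays.asList(text.split(" ")).stream()
                          .map(word -> new Tuple2<>(word, 1))
                          .iterator());
            // Collect the aggregated counts back to the driver for inspection.
            Map<String, Integer> counts = pairs.reduceByKey((v1, v2) -> v1 + v2).collectAsMap();
            counts.forEach((word, n) -> System.out.println(word + " -> " + n));
        }
    }
}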
1. Initialize the SparkConf
SparkConf conf = new SparkConf().setAppName("wordcount") .setMaster("local[*]");
A Spark application uses a single SparkConf to configure its SparkContext, and only one active SparkContext is allowed per JVM.
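Beyond setAppName and setMaster, arbitrary Spark properties can be attached to the same SparkConf before the context is created; the Kryo serializer setting below is only an illustrative choice, not part of the original example.

SparkConf conf = new SparkConf()
        .setAppName("wordcount")
        .setMaster("local[*]") // use all local cores; replace with a cluster URL when submitting to a cluster
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); // illustrative extra property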
2. Initialize the JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
3. Load the input data
JavaRDD<String> input = sc.textFile(args[0]);
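textFile also accepts a minimum number of partitions as a second argument, which can improve parallelism for large files; the value 4 below is just an example, not from the original text.

JavaRDD<String> input = sc.textFile(args[0], 4); // request at least 4 partitions (example value)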
4. Split the input data into words and create tuples
JavaPairRDD<String, Integer> flattenPairs = inputData.flatMapToPair(text ->
        Arrays.asList(text.split(" ")).stream()
              .map(word -> new Tuple2<String, Integer>(word, 1))
              .iterator());
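Splitting on a single space can produce empty tokens when lines contain repeated spaces; a possible refinement (an assumption, not in the original) is to filter them out before pairing:

JavaPairRDD<String, Integer> flattenPairs = inputData.flatMapToPair(text ->
        Arrays.asList(text.split(" ")).stream()
              .filter(word -> !word.isEmpty()) // drop empty tokens caused by repeated spaces
              .map(word -> new Tuple2<String, Integer>(word, 1))
              .iterator());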
5. Aggregate (sum) the values for each key
JavaPairRDD<String, Integer> wordCountRDD = flattenPairs.reduceByKey((v1, v2) -> v1 + v2);
6. Save the RDD
wordCountRDD.saveAsTextFile("path_of_output_file");
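saveAsTextFile also has an overload that compresses the output part files with a Hadoop codec; GzipCodec below is only one possible choice (note that the first listing explicitly disables output compression through the Hadoop configuration).

wordCountRDD.saveAsTextFile("path_of_output_file", org.apache.hadoop.io.compress.GzipCodec.class);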