import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
/**
 * Spark word count: read a local text file and count word occurrences.
 */
public class SparkWc {
    public static void main(String[] args) {
        // For local testing, setMaster must be set to "local".
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("wc");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sc.textFile("D:\\a.txt");
        // Split each line read from the file into words.
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // Map each word to a (word, 1) pair.
        JavaPairRDD<String, Integer> pairRDD = words.mapToPair(word -> new Tuple2<>(word, 1));
        // Sum the counts for each word.
        JavaPairRDD<String, Integer> pairRDD1 = pairRDD.reduceByKey((value1, value2) -> value1 + value2);
        // Note: when the lambda parameter is an explicitly typed tuple, it must be wrapped in parentheses.
        // Swap to (count, word) so the pairs can be sorted by count.
        JavaPairRDD<Integer, String> pairRDD2 = pairRDD1.mapToPair((Tuple2<String, Integer> tuple2) ->
                new Tuple2<>(tuple2._2, tuple2._1));
        // Sort by count in descending order, then swap back to (word, count).
        JavaPairRDD<String, Integer> pairRDD3 = pairRDD2.sortByKey(false).mapToPair((Tuple2<Integer, String> tuple2) ->
                new Tuple2<>(tuple2._2, tuple2._1));
        pairRDD3.foreach(wc -> System.out.println(wc));
        // Write the results to the D:\b directory.
        pairRDD3.saveAsTextFile("D:\\b");
        sc.close();
    }
}
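For reference, with a hypothetical input file D:\a.txt containing the two lines shown below, the job prints and saves one Tuple2 per line in (word,count) form, sorted by count in descending order (the relative order of words with equal counts is not guaranteed):

// Hypothetical contents of D:\a.txt:
//   hello spark hello
//   hello world
// Console output and contents of D:\b\part-00000:
//   (hello,3)
//   (spark,1)
//   (world,1)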
Maven configuration:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.0.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.0.2</version>
</dependency>
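The spark-streaming dependency is not actually used by this word-count example and can be omitted. Because the code uses Java 8 lambdas, the project also needs to compile at language level 1.8 or above; a minimal sketch of the corresponding maven-compiler-plugin configuration (the plugin version shown is an assumption) might look like this:

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <!-- Assumed version; any release supporting source/target 1.8 works -->
            <version>3.8.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
    </plugins>
</build>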