Spark-WordCount 之java写法
1.原始写法
package com.spark.day01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Spark word count in "classic" Java style (anonymous inner classes).
 *
 * The entry point for Spark programming is a SparkContext; the concrete type
 * differs by API:
 * - Spark Core (Java):   JavaSparkContext
 * - Spark Core (Scala):  SparkContext
 * - Spark SQL:           SQLContext / HiveContext, or (preferred today) SparkSession
 * - Spark Streaming:     StreamingContext
 * A SparkContext is built from a SparkConf carrying the configuration.
 *
 * Program structure:
 * 1. Build the programming entry point (SparkContext) from a SparkConf.
 * 2. Load external data into the programming model: an RDD.
 * 3. Apply transformations to the RDD (the business logic).
 * 4. Trigger the job with an action (transformations are lazy).
 * 5. Release resources.
 */
public class _01WordCountJavaApp {
    public static void main(String[] args) {
        // 1. Build the programming entry point: SparkContext.
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName(_01WordCountJavaApp.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // 2. Load external data; each RDD element is one line of the file.
        JavaRDD<String> lines = jsc.textFile("D:\\aaa\\word.txt");

        // 3. Transformations.
        // NOTE(review): the split pattern is a literal dot ("\\."), not whitespace —
        // the sample input appears to be Java source text, so tokens are separated
        // by dots. For ordinary whitespace-separated text use "\\s+" instead.
        JavaRDD<String> wordRDD = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                // Parameter renamed from "lines" to "line": the old name shadowed
                // the outer JavaRDD<String> lines variable, which was misleading.
                String[] tokens = line.split("\\.");
                return Arrays.asList(tokens).iterator();
            }
        });

        // Pair each word with an initial count of 1.
        JavaPairRDD<String, Integer> pairsRDD = wordRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        // Sum the counts per word.
        JavaPairRDD<String, Integer> retRDD = pairsRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // 4. Action: triggers execution of the (lazy) transformations above.
        // In local[*] mode the println output appears on this JVM's console;
        // on a cluster it would go to the executors' stdout instead.
        retRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + "--->" + t._2);
            }
        });

        // 5. Release resources.
        jsc.stop();
    }
}
上面的代码是使用旧的Java方式编写的,其中大量使用了匿名内部类。此种写法比较繁杂,可以使用lambda表达式进行优化。
2.使用Lambda表达式进行优化
package com.spark.day01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Spark word count rewritten with Java 8 lambda expressions.
 *
 * The commented-out sections are kept on purpose: each one is the
 * anonymous-inner-class version that the lambda on the following line
 * replaces, so the two styles can be compared side by side.
 */
public class _02WordCountJavaLamdaApp {
    public static void main(String[] args) {
        // 1. Build the programming entry point: SparkContext.
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName(_02WordCountJavaLamdaApp.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // 2. Load external data; each RDD element is one line of the file.
        JavaRDD<String> lines = jsc.textFile("D:\\aaa\\word.txt");

        /* // transformation — anonymous-inner-class form:
        JavaRDD<String> wordRDD = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String lines) throws Exception {
                String[] split = lines.split("\\.");
                Iterator<String> it = Arrays.asList(split).iterator();
                return it;
            }
        });*/
        // Split each line on a literal dot (the sample input is Java source text).
        JavaRDD<String> wordRDD = lines.flatMap(line -> Arrays.asList(line.split("\\.")).iterator());

        /* JavaPairRDD<String, Integer> pairsRDD = wordRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });*/
        // Pair each word with an initial count of 1.
        JavaPairRDD<String, Integer> pairsRDD = wordRDD.mapToPair(word -> new Tuple2<>(word, 1));

        /* JavaPairRDD<String, Integer> retRDD = pairsRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });*/
        // Method reference is the idiomatic replacement for (x, y) -> x + y.
        JavaPairRDD<String, Integer> retRDD = pairsRDD.reduceByKey(Integer::sum);

        // 4. Action: triggers execution of the (lazy) transformations above.
        /* retRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + "--->" + t._2);
            }
        });*/
        // Separator aligned with the run output shown below (11 dashes).
        retRDD.foreach(t -> System.out.println(t._1 + "----------->" + t._2));

        // 5. Release resources.
        jsc.stop();
    }
}
运行结果
Map----------->1
net----------->2
util----------->5
import----------->10
BufferedReader----------->1
IOException----------->1
java----------->10
FileReader----------->1
io----------->3
ArrayList----------->1
URISyntaxException----------->1
Arrays----------->1
HashMap----------->1
URI----------->1
List----------->1