一. Java API 实现 WordCount
代码实现:
public class JavaWordCount {
    /**
     * Classic word count on Spark's Java API, written with explicit anonymous
     * inner classes (contrast with the lambda version below).
     *
     * <p>Usage: JavaWordCount &lt;input&gt; &lt;output&gt;
     *
     * @param args args[0] = input path, args[1] = output directory
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            // Usage errors belong on stderr; the message also needs spacing to be readable.
            System.err.println("Usage: JavaWordCount <input> <output>");
            System.exit(1);
        }
        SparkConf conf = new SparkConf();
        // Local mode for demo purposes; in production the master is set via spark-submit.
        conf.setMaster("local");
        conf.setAppName(JavaWordCount.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> line = jsc.textFile(args[0]);
        // Split each line on spaces and flatten into individual words.
        // FlatMapFunction<IN, OUT> takes the input type and the output element type.
        JavaRDD<String> jrdd1 = line.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                // flatMap's contract returns an Iterator, so convert the backing array.
                return Arrays.asList(s.split(" ")).iterator();
            }
        });
        // Pair each word with the count 1: (word, 1).
        // PairFunction<IN, K, V>: input type, tuple key type, tuple value type.
        JavaPairRDD<String, Integer> jrdd2 = jrdd1.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        // Aggregate counts per word, i.e. (a, b) -> a + b.
        // Function2<T1, T2, R>: the third type parameter is the return type.
        JavaPairRDD<String, Integer> result = jrdd2.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // For a quick local check, print each tuple with:
        //   result.foreach(tuple -> System.out.println(tuple));
        // Sorting by count: swap (word, count) -> (count, word) so the count is the key.
        JavaPairRDD<Integer, String> res1 = result.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return t.swap();
            }
        });
        // sortByKey sorts ascending by default; pass false for descending order.
        JavaPairRDD<Integer, String> res2 = res1.sortByKey(false);
        // Swap back to (word, count) for the final output.
        JavaPairRDD<String, Integer> finalRes = res2.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return t.swap();
            }
        });
        // Persist the sorted counts.
        finalRes.saveAsTextFile(args[1]);
        // Release Spark resources.
        jsc.close();
    }
}
二. Java Lambda 实现 WordCount
代码实现:
public class JavaLambdaWordCount {
    /**
     * Word count on Spark's Java API using lambda expressions — the concise
     * equivalent of the anonymous-inner-class version above.
     *
     * <p>Usage: JavaLambdaWordCount &lt;input&gt; &lt;output&gt;
     *
     * @param args args[0] = input path, args[1] = output directory
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            // Usage errors belong on stderr; the message also needs spacing to be readable.
            System.err.println("Usage: JavaLambdaWordCount <input> <output>");
            System.exit(1);
        }
        SparkConf conf = new SparkConf();
        // Local mode for demo purposes; in production the master is set via spark-submit.
        conf.setMaster("local");
        conf.setAppName(JavaLambdaWordCount.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Read the input file as lines.
        JavaRDD<String> jrdd = jsc.textFile(args[0]);
        // Split each line on spaces and flatten into individual words.
        JavaRDD<String> jrdd2 = jrdd.flatMap(t -> Arrays.asList(t.split(" ")).iterator());
        // Pair each word with the count 1; the diamond operator infers the type arguments.
        JavaPairRDD<String, Integer> jprdd = jrdd2.mapToPair(t -> new Tuple2<>(t, 1));
        // Aggregate counts per word; Integer::sum is the idiomatic (a, b) -> a + b.
        JavaPairRDD<String, Integer> res = jprdd.reduceByKey(Integer::sum);
        // Persist the word counts.
        res.saveAsTextFile(args[1]);
        // Release Spark resources.
        jsc.close();
    }
}