package spark;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Counts how many times each distinct line occurs in a text file.
 *
 * <p>Every input line is used verbatim as a key and paired with the value 1;
 * the pairs are then summed per key, so identical lines are tallied together.
 * One result entry per distinct line is printed to standard output.
 */
public class LineCount {

    /** Input file that is read when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "E:\\hello.txt";

    /**
     * Runs the line-count job.
     *
     * @param args optional command-line arguments; when present, {@code args[0]}
     *             is the path of the text file to read, otherwise
     *             {@link #DEFAULT_INPUT_PATH} is used
     */
    public static void main(String[] args) {
        // Allow the input to be overridden without recompiling; fall back to the
        // original hard-coded location so existing invocations keep working.
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Application name and master used to build the Spark context.
        SparkConf conf = new SparkConf()
                .setAppName("LineCount")
                .setMaster("local");

        // try-with-resources guarantees the context is closed even when one of
        // the steps below throws (previously close() was skipped on failure).
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Initial RDD: each element is one line of the input text.
            JavaRDD<String> lines = sc.textFile(inputPath);

            // Map every line to a (line, 1) pair.
            JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                    new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String t) throws Exception {
                            return new Tuple2<String, Integer>(t, 1);
                        }
                    });

            // Sum the 1s per key to get the total number of occurrences of each line.
            JavaPairRDD<String, Integer> lineCounts = pairs.reduceByKey(
                    new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer v1, Integer v2) throws Exception {
                            return v1 + v2;
                        }
                    });

            // Action: print each distinct line together with its count.
            lineCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> t) throws Exception {
                    System.out.println(t._1 + "出现" + t._2 + " 次");
                }
            });
        }
    }
}
// Java + Maven Spark development: count how many times each line occurs in a text file
// Latest recommended article published on 2021-11-15 12:35:54