Environment setup is covered by plenty of resources online, so I won't repeat it here.
Three ways to write WordCount:
1. A Scala implementation, which is quite concise.
package main.scala.com.thy.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ScalaWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ScalaWordCount").setMaster("local[3]")
    val sc = new SparkContext(conf)
    // One-liner version: everything chained into a single expression; concise but hard to read
    // sc.textFile("E:\\hdfsDemo\\logs\\accesslog").flatMap(_.split(" "))
    //   .map((_, 1)).reduceByKey(_ + _).sortBy(_._2, false).saveAsTextFile("E:\\hdfsDemo\\logs\\out")
    // sc.stop()
    // Read the input file
    val lines: RDD[String] = sc.textFile("E:\\hdfsDemo\\logs\\accesslog")
    // Split each line on spaces and flatten the results
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // Turn each word into a (word, 1) tuple
    val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
    // Sum the 1s per word
    val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    // Sort by the second tuple element, the count (ascending by default;
    // pass ascending = false for the descending order used in the one-liner above)
    val sort: RDD[(String, Int)] = reduced.sortBy(_._2)
    // Save the result
    sort.saveAsTextFile("E:\\hdfsDemo\\logs\\out")
    // Release resources
    sc.stop()
  }
}
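As a quick sanity check, a minimal sketch like the following (the sample data is made up) runs the same pipeline over an in-memory collection and prints the counts in descending order, matching the one-liner's sortBy(_._2, false):

    // Minimal sketch: same pipeline over hypothetical in-memory sample data
    val sample = sc.parallelize(Seq("hello world", "hello spark"))
    sample.flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false) // descending by count
      .collect()
      .foreach(println) // e.g. (hello,2), (world,1), (spark,1)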
2. A Java implementation, which is a good deal more verbose...
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

public class JavaWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaWordCount").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Read the file through the JavaSparkContext
        JavaRDD<String> lines = jsc.textFile("E:\\hdfsDemo\\input");
        // Split each line on commas and flatten; flatMap must return an Iterator, hence the conversion
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(",")).iterator();
            }
        });
        // Pair each word with 1, producing (word, 1) tuples; this takes an anonymous PairFunction
        JavaPairRDD<String, Integer> wordAndOne = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });
        // Sum the counts per word
        JavaPairRDD<String, Integer> reduced = wordAndOne.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // A JavaPairRDD can only be sorted by key, so swap each tuple to (count, word) first
        JavaPairRDD<Integer, String> swap = reduced.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> tp) throws Exception {
                return tp.swap();
            }
        });
        // Sort the swapped tuples by key, descending
        JavaPairRDD<Integer, String> sorted = swap.sortByKey(false);
        // Swap back to the (word, count) form
        JavaPairRDD<String, Integer> result = sorted.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> tp) throws Exception {
                return tp.swap();
            }
        });
        // Save the result
        result.saveAsTextFile("E:\\hdfsDemo\\out");
        // Release resources
        jsc.stop();
    }
}
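Most of the extra ceremony in the Java version is the swap / sortByKey / swap detour: a JavaPairRDD can only sort by its key, whereas Scala's RDD.sortBy takes an arbitrary key function. For comparison, reusing the reduced RDD from the Scala example above, the same descending sort is a single call:

    // Scala equivalent of swap -> sortByKey(false) -> swap
    val sortedDesc = reduced.sortBy(_._2, ascending = false)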
3. A Java implementation using the lambda feature introduced in Java 8, which is concise again.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class JavaWordCountLamda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaWordCountLamda").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> lines = jsc.textFile("E:\\hdfsDemo\\input");
        // Split and flatten; note the result must still be converted to an Iterator
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(",")).iterator());
        // Form (word, 1) pairs
        JavaPairRDD<String, Integer> wordAndOne = words.mapToPair(word -> new Tuple2<>(word, 1));
        // Sum the counts per word
        JavaPairRDD<String, Integer> reduced = wordAndOne.reduceByKey((m, n) -> m + n);
        // Sorting works only on the key, so swap to (count, word) first
        JavaPairRDD<Integer, String> swap = reduced.mapToPair(tp -> tp.swap());
        // Sort descending by count
        JavaPairRDD<Integer, String> sorted = swap.sortByKey(false);
        // Swap back to (word, count)
        JavaPairRDD<String, Integer> res = sorted.mapToPair(tp -> tp.swap());
        res.saveAsTextFile("E:\\hdfsDemo\\out");
        jsc.stop();
    }
}