Spark Scala
package com.doit.Spark.day01
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object SparkRDD {
def main(args: Array[String]): Unit = {
//An RDD: informally, one "magical" big distributed collection
val conf: SparkConf = new SparkConf().setAppName("WordCount")
//Create the SparkContext
val sc: SparkContext = new SparkContext(conf)
//Create the RDD; the input path is passed in via main
val lines: RDD[String] = sc.textFile(args(0))
//Call the RDD's Transformation methods
//Calling a Transformation operator produces a new RDD
//Split and flatten: the Transformations start here
val words: RDD[String] = lines.flatMap(_.split(" "))
//Pair each word with a 1
val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
//Aggregate by key: reduceByKey pre-aggregates within each partition, then aggregates globally
val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _) //values with the same key are summed; the key is unchanged
//Sort by occurrence count, descending
val sorted: RDD[(String, Int)] = reduced.sortBy(_._2, false)
//The Transformations end here; none of their code runs until an Action starts (lazy evaluation)
//Write out to HDFS; the output path is passed in via main
//Call an Action operator
sorted.saveAsTextFile(args(1))
//Release resources
sc.stop()
}
}
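A Transformation only records a step in the RDD's lineage; nothing executes until an Action is called. A minimal local-mode sketch that makes this laziness visible (the local[*] master, the object name, and the inline data are illustrative, not part of the job above):

import org.apache.spark.{SparkConf, SparkContext}
object LazyDemo {
  def main(args: Array[String]): Unit = {
    //local[*] runs Spark inside this JVM on all cores; handy for testing without a cluster
    val sc = new SparkContext(new SparkConf().setAppName("LazyDemo").setMaster("local[*]"))
    val words = sc.parallelize(Seq("a", "b", "a"))
    //map is a Transformation: it only records the computation, nothing runs yet
    val counted = words.map { w => println(s"processing $w"); (w, 1) }.reduceByKey(_ + _)
    println("no 'processing' output yet: the job has not started")
    //collect is an Action: the whole lineage executes now
    counted.collect().foreach(println)
    sc.stop()
  }
}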
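The comment on reduceByKey is the key performance point: values are combined inside each partition before the shuffle (a map-side combine), so far less data crosses the network than if all pairs were grouped first and summed afterwards. A short contrast, reusing wordAndOne from the code above (viaGroup and viaReduce are illustrative names):

//reduceByKey: partial sums per partition, then one small shuffle to merge them
val viaReduce = wordAndOne.reduceByKey(_ + _)
//groupByKey: ships every single (word, 1) pair across the network, then sums; usually much slower
val viaGroup = wordAndOne.groupByKey().mapValues(_.sum)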
------------------------------------------------------------------------------------------
import org.apache.spark.{SparkConf, SparkContext}
object OneLineWordCount {
def main(args: Array[String]): Unit = {
val sc: SparkContext = new SparkContext(new SparkConf().setAppName("WordCount"))
sc
.textFile(args(0))
.flatMap(_.split(" "))
.map((_, 1))
.reduceByKey(_ + _)
.sortBy(_._2, false)
.saveAsTextFile(args(1))
//Release resources
sc.stop()
}
}
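The same chain can also be tried interactively in spark-shell, which creates the SparkContext for you as sc; a quick sketch, with an illustrative local input path:

scala> sc.textFile("file:///root/word.txt").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).sortBy(_._2, false).collect().foreach(println)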
Java
package com.doit.Spark.day01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
* @Classname SparkRdd
* @Date 2020/11/22 17:42
* @Created by BaiYE
* @Description
*/
public class SparkRdd {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("WordCount");
//Create the JavaSparkContext
JavaSparkContext jsc = new JavaSparkContext(conf);
//Create a JavaRDD; the input path is passed in via main
JavaRDD<String> lines = jsc.textFile(args[0]);
//Call Transformations
//Split each line into words
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(" ")).iterator(); //split the line into words, wrap them in a List, then convert to an Iterator (an array cannot be converted to an Iterator directly)
}
});
//Pair each word with a 1, using mapToPair
JavaPairRDD<String, Integer> wordAndOne = words.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return Tuple2.apply(s, 1);
}
});
//Aggregate by key
JavaPairRDD<String, Integer> reduced = wordAndOne.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
//Sort
//The Java pair-RDD API can only sort by key, so swap key and value first (a Scala contrast follows this class)
JavaPairRDD<Integer, String> swapped = reduced.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<String, Integer> tp) throws Exception {
// return Tuple2.apply(tp._2,tp._1);
return tp.swap();
}
});
//Sort by key in descending order
JavaPairRDD<Integer, String> sorted = swapped.sortByKey(false);
//Swap key and value back
JavaPairRDD<String, Integer> result = sorted.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Tuple2<Integer, String> tp) throws Exception {
return tp.swap();
}
});
//Write out; the output path is passed in via main
result.saveAsTextFile(args[1]);
//Release resources
jsc.stop();
}
}
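For contrast: the swap / sortByKey / swap detour above is needed because the Java pair-RDD API sorts by key only. The Scala API's sortBy takes an arbitrary key extractor, so the same step is a single call (reusing reduced from the Scala version at the top):

val sorted = reduced.sortBy(_._2, ascending = false) //sort the (word, count) pairs by count directly, no swapping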
Java_Lambda
package com.doit.Spark.day01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
/**
* @Classname lambda
* @Date 2020/11/22 22:09
* @Created by BaiYE
* @Description
*/
public class lambdaRdd {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("WordCount");
//Create the JavaSparkContext
JavaSparkContext jsc = new JavaSparkContext(conf);
//Create a JavaRDD; the input path is passed in via main
JavaRDD<String> lines = jsc.textFile(args[0]);
//Split each line into words, wrap them in a List, and convert to an Iterator
JavaRDD<String> word = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
//Pair each word with a 1
JavaPairRDD<String, Integer> wordAndOne = word.mapToPair(w -> Tuple2.apply(w, 1));
//Aggregate by key
JavaPairRDD<String, Integer> reduced = wordAndOne.reduceByKey((a, b) -> a + b);
//The Java pair-RDD API can only sort by key, so swap key and value first
JavaPairRDD<Integer, String> swapped = reduced.mapToPair(tp -> tp.swap());
//Sort by word count, descending
JavaPairRDD<Integer, String> sorted = swapped.sortByKey(false);
//After sorting, swap key and value back
JavaPairRDD<String, Integer> result = sorted.mapToPair(tp -> tp.swap());
//Write out; the output path is passed in via main
result.saveAsTextFile(args[1]);
//Release resources
jsc.stop();
}
}
Submit
Package the code into a .jar, copy it to any machine with Spark installed, and submit it:
[root@linux01 ~]# /opt/apps/spark-3.0.1-bin-hadoop3.2/bin/spark-submit --master spark://linux01:7077 --executor-memory 1g --total-executor-cores 4 --class com.doit.Spark.day01.lambdaRdd /root/SparkTest-1.0-SNAPSHOT.jar hdfs://linux01:8020/word.txt hdfs://linux01:8020/output/
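When the job finishes, saveAsTextFile leaves one part file per partition in the output directory; assuming the paths above, a quick check with the HDFS CLI:

[root@linux01 ~]# hdfs dfs -cat hdfs://linux01:8020/output/part-*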