Running on a Cluster
POM dependencies
<!-- shared version properties -->
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.12.10</scala.version>
<spark.version>3.0.0</spark.version>
<encoding>UTF-8</encoding>
</properties>
<dependencies>
<!-- Scala language dependency -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<!-- with <scope>provided</scope> the dependency is not packaged into the jar -->
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
</dependencies>
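Before submitting to the cluster, package the project into a jar with the usual Maven command (this assumes the project also configures a Scala compiler plugin such as scala-maven-plugin, which is not shown in the snippet above):
mvn clean package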
Scala version
package test01
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("WordCount")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile(args(0)) // input path (first program argument)
val words: RDD[String] = lines.flatMap(_.split(" "))
val wordAndOne: RDD[(String, Int)] = words.map((_,1))
val reduce: RDD[(String, Int)] = wordAndOne.reduceByKey(_+_)
val sorted = reduce.sortBy(_._2, false) // sort by word count in descending order
sorted.saveAsTextFile(args(1)) // output path (second program argument)
sc.stop()
}
}
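The pipeline is easy to trace on a small in-memory sample, for example in spark-shell (where sc is already provided); the values in the comments are what each step would produce:
// sample data instead of an input file
val lines = sc.parallelize(Seq("spark hadoop spark", "hadoop spark"))
val words = lines.flatMap(_.split(" "))      // spark, hadoop, spark, hadoop, spark
val wordAndOne = words.map((_, 1))           // (spark,1), (hadoop,1), (spark,1), ...
val reduced = wordAndOne.reduceByKey(_ + _)  // (spark,3), (hadoop,2)
reduced.sortBy(_._2, false).collect()        // Array((spark,3), (hadoop,2))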
Java version
package test01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
public class JavaWordCount {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("javaWordCount");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile(args[0]);
JavaRDD<String> words = lines.flatMap(x -> Arrays.stream(x.split(" ")).iterator());
// use mapToPair from the Java API so the pair operations below are available
JavaPairRDD<String, Integer> wordAndOne = words.mapToPair(x -> Tuple2.apply(x, 1));
JavaPairRDD<String, Integer> reduced = wordAndOne.reduceByKey((x, y) -> x + y);
// sortByKey sorts by key only, so swap (word, count) to (count, word), sort descending, then swap back
JavaPairRDD<String, Integer> sorted = reduced.mapToPair(x -> x.swap()).sortByKey(false).mapToPair(x -> x.swap());
sorted.saveAsTextFile(args[1]);
jsc.stop();
}
}
To run on the cluster, execute:
/opt/app/spark-3.0.0-bin-hadoop2.7/bin/spark-submit --master spark://linux01:7077 --executor-memory 1g --total-executor-cores 4 --class test01.WordCount /test/spark-in-action-1.0-SNAPSHOT.jar hdfs://linux01:9000/test/word hdfs://linux01:9000/test/out0
--master spark://linux01:7077 specifies the master
--executor-memory 1g specifies the memory per executor
--total-executor-cores 4 specifies the total number of executor cores
--class test01.WordCount specifies the main class to run
These are followed by the jar path and the program arguments (input and output paths).
Local mode
SparkConf conf = new SparkConf().setAppName("javaWordCount");
conf.setMaster("local[*]"); // run in local mode
System.setProperty("HADOOP_USER_NAME", "root"); // user identity for accessing HDFS
JavaSparkContext jsc = new JavaSparkContext(conf);
......
HDFS paths used as program arguments: input hdfs://linux01:9000/test/word, output hdfs://linux01:9000/test/out1
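For reference, a minimal Scala sketch of the same local-mode setup (the host name linux01 and user root are taken from the examples above; adjust for your environment):
package test01

import org.apache.spark.{SparkConf, SparkContext}

object WordCountLocal {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root") // identity used when writing to HDFS
    val conf = new SparkConf().setAppName("WordCount").setMaster("local[*]") // local mode, no cluster needed
    val sc = new SparkContext(conf)
    sc.textFile("hdfs://linux01:9000/test/word")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, false)
      .saveAsTextFile("hdfs://linux01:9000/test/out1")
    sc.stop()
  }
}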
Default number of partitions (for an RDD created with parallelize) = total executor cores
Default number of partitions (for an RDD read with textFile) = number of input splits + n (if one or more of the files is relatively large it is split further; n is related to minPartitions, which defaults to 2)
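A quick way to check these defaults is to ask the RDD itself, for example in spark-shell (the path and numbers below are illustrative):
// RDD built from a collection: partitions default to the total number of cores
sc.parallelize(1 to 100).getNumPartitions

// RDD read from a file: partitions follow the input splits, with at least minPartitions (default 2)
sc.textFile("hdfs://linux01:9000/test/word").getNumPartitions

// minPartitions can be raised explicitly with the second argument
sc.textFile("hdfs://linux01:9000/test/word", 8).getNumPartitions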