1. Scala代码
在scala目录下创建包com.sanqian.scala
再创建一个Scala object:WorldCountScala(名称需与下面代码中的object名保持一致)
package com.sanqian.scala
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Word-count Spark job (Scala version).
 *
 * Reads a text file, splits every line on single spaces and prints each
 * distinct word together with the number of times it occurs.
 */
object WorldCountScala {

  /**
   * Job entry point.
   *
   * @param args when exactly one argument is supplied it is used as the input
   *             path; with any other argument count the built-in default path
   *             is used
   */
  def main(args: Array[String]): Unit = {
    // Step 1: create the SparkContext.
    // The master is intentionally not set in code (setMaster stays commented
    // out below), so it has to be supplied from outside the program.
    val conf = new SparkConf()
    conf.setAppName("WorldCountScala")
    // .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // Step 2: load the data.
      val path = if (args.length == 1) args(0) else "D:\\data\\wordcount.txt"
      val linesRDD = sc.textFile(path)

      // Step 3: split every line into individual words.
      val wordsRDD = linesRDD.flatMap(_.split(" "))

      // Step 4: turn every word into a (word, 1) pair.
      val pairsRDD = wordsRDD.map((_, 1))

      // Step 5: group by key (the word) and sum the counts.
      val wordCountRDD = pairsRDD.reduceByKey(_ + _)

      // Step 6: print every (word, count) result.
      wordCountRDD.foreach { case (word, count) => println(s"$word---$count") }
    } finally {
      // Step 7: always stop the SparkContext, even when a step above throws,
      // so the context is not left running after a failed job.
      sc.stop()
    }
  }
}
2. Java代码
在java目录下创建包com.sanqian.java
再创建一个Java类:WordCountJava
package com.sanqian.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class WordCountJava {
public static void main(String[] args) {
//第一步:创建sparkContext
// 注意:针对java代码需要获取JavaSparkContext
SparkConf conf = new SparkConf();
conf.setAppName("WordCountJava");
// .setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
//第二步:加载数据
String path = "D:\\data\\wordCount.txt";
if (args.length == 1){
path = args[0];
}
JavaRDD<String> linesRDD = sc.textFile(path);
//第三步:对数据进行切割,把一行数据切分成一个个的单词
// 第一个参数表示收入类型,第二个参数表示输出类型
JavaRDD<String> wordsRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(" ")).iterator();
}
});
//第四步: 迭代words
// 第一个参数是输入数据类型,第二个是输出tuple中的第一个参数类型,第三个是输出tuple中的第二个参数类型
JavaPairRDD<String, Integer> pairRDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String word) throws Exception {
return new Tuple2<String, Integer>(word, 1);
}
});
//第五步:根据key(其实就是word)进行分组聚合统计
JavaPairRDD<String, Integer> wordCountRDD = pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
});
//第六步:将结果打印到控制台
wordCountRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
public void call(Tuple2<String, Integer> tup) throws Exception {
System.out.println(tup._1 + "---" + tup._2);
}
});
//第七步:停止SparkContext
sc.stop();
}
}
3. 打包运行
需要java编译插件、Scala编译插件、打包插件
<?xml version="1.0" encoding="UTF-8"?>
<!--
Maven build for the db_spark word-count examples.
Declares the Spark core dependency and three build plugins: the Java
compiler, the Scala compiler, and the assembly (packaging) plugin.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.sanqian</groupId>
<artifactId>db_spark</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Spark core, built for Scala 2.11 (see the _2.11 artifact suffix). -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.3</version>
<!-- provided: Spark is needed to compile but is not bundled into the packaged jar. -->
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Java compiler plugin: compiles Java sources at language level 1.8, UTF-8 encoded. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Scala compiler plugin: Scala version 2.11.12, matching the 2.11 suffix of the spark-core artifact. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.1.6</version>
<configuration>
<scalaCompatVersion>2.11</scalaCompatVersion>
<scalaVersion>2.11.12</scalaVersion>
</configuration>
<executions>
<!-- Runs the add-source and compile goals during the compile phase. -->
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<!-- Runs the add-source and testCompile goals during the test-compile phase. -->
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Packaging plugin: builds an additional jar using the jar-with-dependencies descriptor. -->
<!-- NOTE(review): no <version> is declared for this plugin; consider pinning one. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- Left empty: no main class is configured in the manifest here. -->
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
<executions>
<!-- Binds the assembly "single" goal to the package phase. -->
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
打包代码:
mvn clean package -DskipTests
注意:
打包之前需要把spark-core依赖的作用域设置为provided,这样就不会把Spark相关依赖打入jar包
提交脚本: lwx_run.sh
# Submit a Spark application from db_spark-1.0-SNAPSHOT.jar to YARN in client mode.
#
# Usage: sh lwx_run.sh <main-class> [input-path]
#   $1  fully qualified name of the class to run (required)
#   $2  optional argument forwarded to the application (the input path)
if [ -z "$1" ]; then
  # Without a class name the command line below would be malformed
  # ("--class --master yarn ..."), so fail early with a usage message.
  echo "Usage: sh $0 <main-class> [input-path]" >&2
  exit 1
fi

# "$1" is quoted to avoid word splitting and glob expansion.
# ${2:+"$2"} forwards the second argument as a single word only when it is
# set and non-empty, so omitting it still passes no application argument.
spark-submit \
  --class "$1" \
  --master yarn \
  --deploy-mode client \
  --executor-memory 1G \
  --num-executors 1 \
  db_spark-1.0-SNAPSHOT.jar \
  ${2:+"$2"}
提交任务:
sh lwx_run.sh com.sanqian.java.WordCountJava /words.txt
2458

被折叠的 条评论
为什么被折叠?



