构建 Spark Windows 环境
配置 Windows 版 Hadoop
解压到本地磁盘,配置环境变量(加入 bin 目录和 sbin 目录)
构建 Maven 配置 pom.xml(学习用)
<artifactId>spark-core</artifactId>
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <scala.version>2.12.0</scala.version>
    <hadoop.version>2.7.7</hadoop.version>
    <spark.version>2.4.7</spark.version>
</properties>
<dependencies>
    <!-- Scala standard library -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <!-- Spark core: https://mvnrepository.com/artifact/org.apache.spark/spark-core
         Declared once via ${spark.version}; a second hard-coded 2.4.7 copy was a duplicate and was removed. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.12</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.12</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.12</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.47</version>
    </dependency>
    <!-- Hadoop client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- Currently unused; kept for reference.
         https://mvnrepository.com/artifact/com.google.code.gson/gson
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.0</version>
    </dependency>
    -->
    <!-- Currently unused; kept for reference.
         https://mvnrepository.com/artifact/org.apache.kafka/kafka
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>1.0.0</version>
    </dependency>
    -->
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
        <!-- Compiles Scala main and test sources -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <!-- Builds a fat jar at package time; strips jar signature files,
             which would otherwise make the repackaged jar fail verification -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
</project>
快速上手 WordCount
先测试环境是否正常
// Environment smoke test: build a local SparkContext and immediately stop it.
// If this runs without error, the Windows Spark/Hadoop setup is working.
object wc {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc   = new SparkContext(conf)
    sc.stop()
  }
}
功能实现
// WordCount, version 1: counts word occurrences with groupBy + map.
object wc {
  def main(args: Array[String]): Unit = {
    // Connect to Spark using a local master.
    val conf = new SparkConf().setMaster("local").setAppName("wordcount")
    val sc = new SparkContext(conf)

    // Read all lines from the "data" path.
    val lines: RDD[String] = sc.textFile("data")
    // Flat-map: split each line on single spaces into individual words.
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // Group identical words together.
    val grouped: RDD[(String, Iterable[String])] = words.groupBy(identity)
    // Reshape each group into (word, occurrence count).
    val counts: RDD[(String, Int)] = grouped.map { case (w, occurrences) => (w, occurrences.size) }
    // Collect the results to the driver and print each pair.
    counts.collect().foreach(println)

    sc.stop()
  }
}
第二种实现
object wc2 {
def main(args: Array[String]): Unit = {
//建立连接
val sparConf = new SparkConf().setMaster("local").setAppName("wordcount")
val context = new SparkContext(sparConf)
//读取数据
val value: RDD[String] = context.textFile("data")
//扁平映射,将每一行数据铲粪成一个一个单词
val word: RDD[String] = value.flatMap(_.split(" "))
//结构转化
val wordM: RDD[(String, Int)] = word.map(i => (i, 1))
val wordGroup: RDD[(String, Iterable[(Strin