Prerequisites
This assumes a working Scala development environment (including sbt) is already configured.
Steps
Project structure
build.sbt
project/
src/
|-- main/
|   |-- java/
|   |-- resources/
|   `-- scala/
`-- test/
    |-- java/
    |-- resources/
    `-- scala/
target/
Create a basic sbt Scala project
mkdir hello-world
cd hello-world
mkdir -p src/{main,test}/{java,resources,scala}
mkdir project target
build.sbt
Create the build.sbt file:
scalaVersion := "2.12.16"
name := "hello-world"
organization := "ch.epfl.scala"
version := "1.0"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-sql" % "3.2.1"
)
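A note on the `%%` operator: sbt appends the project's Scala binary version to the artifact name, so with scalaVersion set to 2.12.16 the dependency above resolves to the spark-sql_2.12 artifact. The explicit equivalent with the plain `%` operator would be:

// Equivalent to the `%%` form above, with the Scala binary suffix spelled out
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "3.2.1"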
Main.scala
Create src/main/scala/Main.scala:
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object Main extends App {
  println("Hello, World!")

  // Root directory of the data file; "." means the project root
  val rootPath: String = "."
  val file: String = s"$rootPath/wikiOfSpark.txt"

  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("Spark SQL basic example")

  // `_` initializes the var with its default value (null for a reference type)
  var spark: SparkSession = _
  try {
    spark = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()

    // Read the file contents line by line
    val lineRDD: RDD[String] = spark.sparkContext.textFile(file)
    lineRDD.take(10).foreach(println)

    // Split each line into words
    val wordRDD: RDD[String] = lineRDD.flatMap(line => line.split(" "))
    wordRDD.take(200).foreach(println)

    // Filter out empty strings
    val cleanWordRDD: RDD[String] = wordRDD.filter(word => !word.equals(""))
    cleanWordRDD.take(200).foreach(println)

    // Turn each element into a (key, value) pair
    val kvRDD: RDD[(String, Int)] = cleanWordRDD.map(word => (word, 1))
    kvRDD.take(10).foreach(println)

    // Group by word and count occurrences
    val wordCounts: RDD[(String, Int)] = kvRDD.reduceByKey((x, y) => x + y)
    wordCounts.take(10).foreach(println)

    // Print the five most frequent words
    val top5: Array[(Int, String)] = wordCounts
      .map { case (k, v) => (v, k) }
      .sortByKey(ascending = false)
      .take(5)
    println(top5.mkString("Array(", ", ", ")"))
  } finally {
    if (spark != null) {
      spark.close()
    }
  }
}
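As an aside, the swap-then-sortByKey step at the end is one way to order by count; RDD.top with a custom Ordering is a shorter alternative that avoids a full sort. A minimal sketch (the name top5Alt is illustrative):

// Take the five pairs with the largest counts, in descending order
val top5Alt: Array[(String, Int)] = wordCounts.top(5)(Ordering.by(_._2))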
Run the sbt run command (the program expects wikiOfSpark.txt in the project root) to get the following result:
Array((67,the), (63,Spark), (54,a), (51,and), (50,of))
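As a further aside, since build.sbt already pulls in spark-sql, the same word count can be expressed with the DataFrame/Dataset API instead of raw RDDs. Here is a minimal sketch under that assumption; the object name MainDF is illustrative and not part of the original project:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc

object MainDF extends App {
  val spark = SparkSession.builder()
    .master("local[2]")
    .appName("Word count (DataFrame API)")
    .getOrCreate()
  import spark.implicits._ // encoders for flatMap/filter on Dataset[String]
  try {
    spark.read.textFile("./wikiOfSpark.txt") // Dataset[String], column "value"
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .groupBy("value")
      .count()                 // one "count" per distinct word
      .orderBy(desc("count"))
      .show(5)                 // five most frequent words
  } finally {
    spark.close()
  }
}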
That's it.
Source code: https://github.com/fxtxz2/spark-word-count