1. The classic way: Spark Core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object sparkcore_wordcount {
  def main(args: Array[String]): Unit = {
    // Configure the application
    val conf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Load the input file into an RDD with 2 partitions
    val input: RDD[String] = sc.textFile("src/main/resources/words.txt", 2)
    // Drop null/blank lines, then count the words
    val result: RDD[(String, Int)] = input.filter(line => line != null && line.trim.length > 0)
      // split on whitespace
      .flatMap(_.split("\\s+"))
      // pair each word with an initial count of 1
      .mapPartitions(iter => iter.map(_ -> 1))
      // sum the counts per word
      .reduceByKey(_ + _)
    // Collapse to one partition so the output prints as a single block
    result.coalesce(1).foreachPartition(iter => iter.foreach(println))
    sc.stop()
  }
}
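All of these examples read a plain-text file from src/main/resources/words.txt and assume the Spark artifacts are on the classpath. As a minimal sketch, a build.sbt for running them locally might look like the following; the Scala and Spark versions here are assumptions, so align them with your environment:

// build.sbt — minimal sketch; version numbers are assumptions, pin them to your cluster
ThisBuild / scalaVersion := "2.12.15"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "3.2.0",
  "org.apache.spark" %% "spark-sql"       % "3.2.0", // also covers Structured Streaming
  "org.apache.spark" %% "spark-streaming" % "3.2.0"
)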
2. The refined way: SparkSession
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object sparksession_wordcount {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession
    val session: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName(this.getClass.getSimpleName)
      .config("spark.sql.warehouse.dir", "file:///")
      .getOrCreate()
    session.sparkContext.setLogLevel("ERROR")
    import session.implicits._
    // Read the file as a Dataset[String]
    val input: Dataset[String] = session.read.textFile("src/main/resources/words.txt")
    val output: DataFrame = input
      // drop null/empty lines
      .filter(line => line != null && line.nonEmpty)
      // split each line into words
      .flatMap(line => line.split("\\s+"))
      // group by "value", the default column name of Dataset[String]
      .groupBy("value")
      // count the occurrences per word
      .count()
    output.show()
    session.stop()
  }
}
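The groupBy("value") call works because textFile yields a Dataset[String] whose single column is named value. A typed alternative keeps the Dataset API end to end via groupByKey; the sketch below assumes the same session, implicits, and input as the example above:

// Typed variant (sketch): assumes `session`, `import session.implicits._`, and `input` from above
val typedOutput: Dataset[(String, Long)] = input
  .filter(line => line != null && line.nonEmpty)
  .flatMap(_.split("\\s+"))
  .groupByKey(identity) // key each word by itself
  .count()              // Dataset of (word, occurrences)
typedOutput.show()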
3. The SQL way: Spark SQL
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object sparksql_wordcount {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName(this.getClass.getSimpleName)
      .config("spark.sql.warehouse.dir", "file:///")
      .getOrCreate()
    session.sparkContext.setLogLevel("WARN")
    import session.implicits._
    val input: Dataset[String] = session.read.textFile("src/main/resources/words.txt")
    // Clean and split with the Dataset API, then register a temp view for SQL
    val outDS: Dataset[String] = input.filter(line => line != null && line.trim.length > 0)
      .flatMap(line => line.split("\\s+"))
    outDS.createOrReplaceTempView("view")
    // "value" is the default column name of Dataset[String]
    val resultDF: DataFrame = session.sql("select value, count(1) as count from view group by value order by count")
    resultDF.show()
    session.stop()
  }
}
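Note that order by count sorts ascending, so the rarest words print first. The same result can be produced without SQL through the DataFrame DSL; a sketch, reusing outDS and the implicits from the example above and sorting descending instead:

// DSL equivalent (sketch): assumes `outDS` and `import session.implicits._` from above
val resultDSL: DataFrame = outDS
  .groupBy($"value")
  .count()                 // adds a "count" column per group
  .orderBy($"count".desc)  // most frequent words first
resultDSL.show()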
4. The streaming ways (two variants)
Before running either streaming example, open a command prompt and start a listener with nc -lp 9999 (the port configured in the code); on Linux netcat, nc -lk 9999 keeps the listener alive across reconnects. Type words into that session to feed the stream.
4.1 Spark Streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

object sparkstreaming_wordcount {
  def main(args: Array[String]): Unit = {
    // Create the StreamingContext
    val ssc: StreamingContext = {
      // Build the conf object
      val conf: SparkConf = new SparkConf()
        .setMaster("local[2]")
        .setAppName(this.getClass.getSimpleName)
      // micro-batch interval of 3 seconds
      val context: StreamingContext = new StreamingContext(conf, Seconds(3))
      context
    }
    ssc.sparkContext.setLogLevel("WARN")
    // Receive lines from the nc listener started beforehand (nc -lp 9999)
    val inputDStream: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 9999)
    val resultDStream: DStream[(String, Int)] = inputDStream.filter(line => line != null && line.length > 0)
      .flatMap(line => line.split("\\s+"))
      // map each word to a tuple: one occurrence
      .map(word => (word, 1))
      // aggregate the counts per word within each batch
      .reduceByKey((tmp, item) => tmp + item)
    resultDStream.print()
    // Start the streaming computation
    ssc.start()
    // Block until the application terminates or fails
    ssc.awaitTermination()
    /*
     * Shut down the streaming application (only reached after termination):
     * 1. also stop the underlying SparkContext
     * 2. stop gracefully, letting in-flight batches finish
     */
    ssc.stop(stopSparkContext = true, stopGracefully = true)
  }
}
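One caveat on the shutdown block: awaitTermination() blocks until the application is killed, so the stop(...) call after it is rarely reached in practice. A hedged alternative is awaitTerminationOrTimeout, which returns after a bounded wait and lets you stop explicitly; the 5-minute timeout below is an assumed value for illustration:

// Sketch: bounded wait, then an explicit graceful stop; the timeout is an assumption
ssc.start()
val terminated: Boolean = ssc.awaitTerminationOrTimeout(5 * 60 * 1000L)
if (!terminated) {
  ssc.stop(stopSparkContext = true, stopGracefully = true)
}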
4.2 Structured Streaming
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object structuredstreaming_wordcount {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName(this.getClass.getSimpleName)
      // keep the shuffle small for a local demo
      .config("spark.sql.shuffle.partitions", "2")
      .config("spark.sql.warehouse.dir", "file:///")
      .getOrCreate()
    session.sparkContext.setLogLevel("WARN")
    import session.implicits._
    // Read the socket source as an unbounded DataFrame
    val inputStreamDF: DataFrame = session.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()
    val resultStreamDF: DataFrame = inputStreamDF.as[String]
      .filter(line => line != null && line.length > 0)
      .flatMap(_.split("\\s+"))
      .groupBy($"value")
      .count()
    // Complete mode: re-emit the whole aggregate table on every trigger
    val query: StreamingQuery = resultStreamDF.writeStream
      .outputMode(OutputMode.Complete())
      .format("console")
      .option("truncate", "false")
      .start()
    // Block until the query is stopped or fails
    query.awaitTermination()
    query.stop()
  }
}
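Complete mode re-emits the entire aggregate table on every trigger, which is fine for a console demo but grows with the vocabulary. For sinks that can apply per-row changes, Update mode emits only the rows that changed in each trigger. A sketch of the same query in Update mode with a checkpoint directory, reusing resultStreamDF from above; the checkpoint path is a placeholder assumption:

// Update-mode variant (sketch): assumes `resultStreamDF` from the example above
val updateQuery: StreamingQuery = resultStreamDF.writeStream
  .outputMode(OutputMode.Update())
  .format("console")
  .option("truncate", "false")
  .option("checkpointLocation", "file:///tmp/wordcount-ckpt") // placeholder path, an assumption
  .start()
updateQuery.awaitTermination()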