Several ways to write WordCount in Spark


1. The classic approach: Spark Core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object sparkcore_wordcount {
    def main(args: Array[String]): Unit = {
        // Spark configuration
        val conf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
        val sc = new SparkContext(conf)
        sc.setLogLevel("WARN")
        // Load the input file
        val input: RDD[String] = sc.textFile("src/main/resources/words.txt", 2)
        // Build the word-count result
        val result: RDD[(String, Int)] = input.filter(line => line != null && line.trim.length > 0)
                // Split on whitespace
                .flatMap(_.split("\\s+"))
                .mapPartitions(iter => iter.map(_ -> 1))
                .reduceByKey(_ + _)
        result.coalesce(1).foreachPartition(iter => iter.foreach(println))
        sc.stop()
    }
}
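
For completeness, the same count can be written even more compactly with the RDD countByValue action, which returns the counts to the driver as a Map. A minimal sketch (the object name is illustrative; the input path is the same as above):

import org.apache.spark.{SparkConf, SparkContext}

object sparkcore_wordcount_compact {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
        val sc = new SparkContext(conf)
        sc.setLogLevel("WARN")
        // countByValue is an action: it ships all counts to the driver,
        // so it is only suitable when the number of distinct words is small
        sc.textFile("src/main/resources/words.txt")
                .filter(line => line != null && line.trim.nonEmpty)
                .flatMap(_.split("\\s+"))
                .countByValue()
                .foreach(println)
        sc.stop()
    }
}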

2. The elegant approach: SparkSession

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object sparksession_wordcount {
    def main(args: Array[String]): Unit = {
        // Build the SparkSession
        val session: SparkSession = SparkSession.builder()
                .master("local[2]")
                .appName(this.getClass.getSimpleName)
                .config("spark.sql.warehouse.dir","file:///")
                .getOrCreate()
        session.sparkContext.setLogLevel("ERROR")
        import session.implicits._
        // Read the input file
        val input: Dataset[String] = session.read.textFile("src/main/resources/words.txt")
        val output: DataFrame = input
                // Filter out null or empty lines
                .filter(line => line != null && line.nonEmpty)
                // Split each line into words
                .flatMap(line => line.split("\\s+"))
                // Group by the generated "value" column
                .groupBy("value")
                // Count the rows per group
                .count()
        output.show()
        session.stop()
    }
}
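
If you prefer to stay in the typed Dataset API instead of dropping down to the untyped groupBy("value"), a minimal sketch (reusing the session and input values above, before session.stop()) could look like this:

        // Typed alternative: group by the word itself and count per group
        val typedResult: Dataset[(String, Long)] = input
                .filter(line => line != null && line.nonEmpty)
                .flatMap(_.split("\\s+"))
                .groupByKey(identity)
                .count()
        typedResult.show()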

3. The SQL approach: Spark SQL

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object sparksql_wordcount {
    def main(args: Array[String]): Unit = {
        val session: SparkSession = SparkSession.builder()
                .master("local[2]")
                .appName(this.getClass.getSimpleName)
                .config("spark.sql.warehouse.dir","file:///")
                .getOrCreate()
        session.sparkContext.setLogLevel("WARN")
        import session.implicits._
        val input: Dataset[String] = session.read.textFile("src/main/resources/words.txt")
        // Use the Dataset API to clean and split the lines
        val outDS: Dataset[String] = input.filter(line => line != null && line.trim.length > 0)
                .flatMap(line => line.split("\\s+"))
        outDS.createOrReplaceTempView("view")
        val resultDF: DataFrame = session.sql("select value, count(1) as count from view group by value order by count")
        resultDF.show()
        session.stop()
    }
}
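
The splitting can also be pushed into the SQL itself with split and explode, so only the raw lines need to be registered as a view. A sketch (reusing session and input from the code above, before session.stop(); the view name is illustrative):

        input.createOrReplaceTempView("lines")
        val sqlOnly: DataFrame = session.sql(
            """
              |SELECT word, count(1) AS cnt
              |FROM (SELECT explode(split(value, '\\s+')) AS word FROM lines) t
              |WHERE word != ''
              |GROUP BY word
              |ORDER BY cnt DESC
              |""".stripMargin)
        sqlOnly.show()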

4. Streaming approaches (two variants)

Before running the code, open a command prompt and run nc -lp 9999 (the port configured in the code); any words you type there can then be used to test the job.

4.1 Spark Streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

object sparkstreaming_wordcount {
    def main(args: Array[String]): Unit = {
        // Create the StreamingContext
        val ssc: StreamingContext = {
            // Create the SparkConf
            val conf: SparkConf = new SparkConf()
                    .setMaster("local[2]")
                    .setAppName(this.getClass.getSimpleName)
            // Create the streaming context with a 3-second batch interval
            val context: StreamingContext = new StreamingContext(conf, Seconds(3))
            context
        }
        ssc.sparkContext.setLogLevel("WARN")
        val inputDStream: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 9999)
        val resultDStream: DStream[(String, Int)] = inputDStream.filter(line => line != null && line.length > 0)
                .flatMap(line => line.split("\\s+"))
                // Map each word to a (word, 1) tuple
                .map(word => (word, 1))
                // Aggregate the counts per word
                .reduceByKey((tmp, item) => tmp + item)
        resultDStream.print()
        // Start the streaming computation
        ssc.start()
        // Run until terminated or an error occurs
        ssc.awaitTermination()
        /*
        * Shut down the streaming application:
        * 1. whether to also stop the SparkContext
        * 2. whether to stop gracefully
        * */
        ssc.stop(stopSparkContext = true, stopGracefully = true)

        // Open a command prompt and test with nc -lp 9999
    }
}
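
The counts above are per 3-second batch. If you want running totals across all batches, one option is updateStateByKey, which needs a checkpoint directory. A sketch of the changed part (the checkpoint path is illustrative; place it before ssc.start() and print totalDStream instead of resultDStream):

        ssc.checkpoint("file:///tmp/streaming-wordcount-checkpoint")
        // Merge each batch's counts with the previous running total per word
        val totalDStream: DStream[(String, Int)] = inputDStream
                .flatMap(_.split("\\s+"))
                .map(word => (word, 1))
                .updateStateByKey[Int]((batchCounts: Seq[Int], state: Option[Int]) =>
                    Some(batchCounts.sum + state.getOrElse(0)))
        totalDStream.print()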


4.2 Structured Streaming
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object structuredstreaming_wordcount {
    def main(args: Array[String]): Unit = {
        val session: SparkSession = SparkSession.builder()
                .master("local[2]")
                .appName(this.getClass.getSimpleName)
                .config("spark.sql.shuffle.partitions", "2")
                .config("spark.sql.warehouse.dir","file:///")
                .getOrCreate()
        session.sparkContext.setLogLevel("WARN")
        import session.implicits._

        val inputStreamDF: DataFrame = session.readStream
                .format("socket")
                .option("host", "localhost")
                .option("port", 9999)
                .load()

        val resultStreamDF: DataFrame = inputStreamDF.as[String]
                .filter(line => line != null && line.length > 0)
                .flatMap(_.split("\\s+"))
                .groupBy($"value")
                .count()

        val query: StreamingQuery = resultStreamDF.writeStream
                .outputMode(OutputMode.Complete())
                .format("console")
                .option("truncate","false")
                .start()

        // Block until the streaming query terminates
        query.awaitTermination()
        query.stop()
    }
}
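
By default the console sink emits a result as soon as each micro-batch finishes. To mirror the 3-second batch interval of the DStream version, a processing-time trigger can be added to the writeStream call; a sketch (the interval is illustrative):

        import org.apache.spark.sql.streaming.Trigger

        val query: StreamingQuery = resultStreamDF.writeStream
                .outputMode(OutputMode.Complete())
                .format("console")
                .option("truncate", "false")
                .trigger(Trigger.ProcessingTime("3 seconds"))
                .start()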