spark streaming - scala统计hdfs

本文章主要通过spark streaming实现hdfs文件的统计

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds

/**
 * @author jhp
  *         spark streaming读取hdfs文件
 */
object HDFSWordCount {
  
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
        .setMaster("local[2]")  
        .setAppName("HDFSWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    
    val lines = ssc.textFileStream("hdfs://spark1:9000/wordcount_dir")  
    val words = lines.flatMap { _.split(" ") }  
    val pairs = words.map { word => (word, 1) }  
    val wordCounts = pairs.reduceByKey(_ + _)  
    
    wordCounts.print()  
    
    ssc.start()
    ssc.awaitTermination()
  }
  
}

阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页