The code is as follows:
package com.spark
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object SparkStreamHDFS {

  /** Default HDFS directory to watch when no CLI argument is supplied. */
  private val DefaultInputDir: String =
    "hdfs://****/user/hive/warehouse/dsc_ods.db/spark_data"

  /**
   * Streaming word count over new files landing in an HDFS directory.
   *
   * Polls the input directory on a 10-second batch interval, splits each
   * line on the literal delimiter "| ", and prints per-batch token counts.
   *
   * NOTE(review): `textFileStream` only picks up files whose modification
   * time falls inside the current batch window, so clock skew between the
   * driver and the HDFS cluster makes it silently read nothing — the
   * failure mode described in the note below this code.
   *
   * @param args optional; args(0) overrides the watched HDFS directory.
   */
  def main(args: Array[String]): Unit = {
    // Generalized: directory may be passed on the command line.
    val inputDir = args.headOption.getOrElse(DefaultInputDir)

    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("SparkStream")
      .set("spark.testing.memory", "2147480000")

    val ssc = new StreamingContext(new SparkContext(sparkConf), Seconds(10))

    val lines: DStream[String] = ssc.textFileStream(inputDir)
    // "\\| " is a regex matching the literal two-character delimiter "| ".
    val words: DStream[String] = lines.flatMap(_.split("\\| "))
    val counts: DStream[(String, Int)] = words.map((_, 1)).reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
No matter how the job was run, it never read any data from HDFS. After troubleshooting and searching online, the root cause turned out to be a clock mismatch between the local machine and the cluster: `textFileStream` only picks up files whose modification timestamps fall within the current batch window, so skewed clocks made every new file invisible. Synchronizing the clocks resolved the issue.