So far our Spark Streaming examples have all read their data from a socket. Spark Streaming supports several other kinds of input sources as well; here are two of them.
File stream:
Monitors a directory for changes: whenever a new file appears in the directory, it is read into the stream.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyFileStreamingDemo {
  def main(args: Array[String]): Unit = {
    // Suppress the verbose Spark and Jetty logging
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Create the StreamingContext with a 3-second batch interval
    val conf = new SparkConf().setAppName("StreamTest2").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Create a DStream by monitoring a directory for new files
    val dstream = ssc.textFileStream("G:/msdownld.tmp")
    dstream.print()

    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
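Note that textFileStream only picks up files created in the monitored directory after the job has started, so new files should be written elsewhere first and then moved into the directory. As a minimal sketch of going beyond print(), the following word count applies the standard DStream transformations to the file stream; the object name and app name are illustrative assumptions, not part of the original example.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyFileWordCountDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("FileWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Same directory as above; point it at any local folder to test
    val lines = ssc.textFileStream("G:/msdownld.tmp")

    // Word count over each batch of newly arrived files
    val counts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}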
RDD queue stream:
Mostly used for testing: you push RDDs into a mutable queue, and Spark Streaming pulls RDDs off the queue as the data for each batch.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object MyRDDQueueStreamDemo {
  def main(args: Array[String]): Unit = {
    // Suppress the verbose Spark and Jetty logging
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Create the StreamingContext with a 3-second batch interval
    val conf = new SparkConf().setAppName("StreamTest2").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Define a queue of RDDs
    val queue = new Queue[RDD[Int]]()

    // Fill the queue: create one RDD per second and enqueue it,
    // so Spark Streaming can pick the RDDs up as batches
    for (i <- 1 to 3) {
      queue += ssc.sparkContext.makeRDD(1 to 5)
      Thread.sleep(1000)
    }

    // Create a DStream from the queue of RDDs
    val inputStream = ssc.queueStream(queue)
    inputStream.print()

    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
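By default queueStream consumes exactly one RDD from the queue per batch. As a small variant sketch (the object name and app name are made up for illustration), passing oneAtATime = false makes each batch drain everything currently in the queue, and the resulting DStream supports the usual transformations:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object MyRDDQueueStreamDemo2 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("QueueStreamVariant").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))

    val queue = new Queue[RDD[Int]]()
    for (i <- 1 to 3) {
      queue += ssc.sparkContext.makeRDD(1 to 5)
    }

    // oneAtATime = false: a batch drains all RDDs currently in the queue,
    // instead of taking exactly one RDD per batch (the default)
    val inputStream = ssc.queueStream(queue, oneAtATime = false)

    // Apply an ordinary DStream transformation before printing
    val doubled = inputStream.map(_ * 2)
    doubled.print()

    ssc.start()
    ssc.awaitTermination()
  }
}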