Stream processing
The data to be processed is generated continuously, like flowing water, and has to be handled in real time.
Spark Streaming is a high-level wrapper over the Spark Core API: the incoming stream is cut into small batches by time interval, and each batch is then processed with Spark Core.
DStream: a discretized stream, conceptually a collection (List) of RDDs, one RDD per batch interval (see the sketch below).
StreamingContext: the context object that receives data from a real-time streaming source.
Under the hood it still relies on a SparkContext.
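To make the "list of RDDs" view concrete, here is a minimal sketch (not from the original notes, and run purely locally with no external source) that builds a DStream from an in-memory queue of RDDs via queueStream; each queued RDD is consumed as one batch:

    import scala.collection.mutable
    import org.apache.spark.SparkConf
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object QueueStreamSketch {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[2]").setAppName("QueueStreamSketch")
        val ssc = new StreamingContext(conf, Seconds(1))

        // Pre-build a queue of RDDs; each RDD becomes one 1-second batch
        val queue = mutable.Queue[RDD[String]]()
        queue += ssc.sparkContext.parallelize(Seq("hadoop spark", "spark streaming"))
        queue += ssc.sparkContext.parallelize(Seq("hadoop hadoop"))

        // The DStream is just this sequence of RDDs, handed out one batch at a time
        val lines = ssc.queueStream(queue)
        lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

        ssc.start()
        ssc.awaitTermination()
      }
    }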
Code example: WordCount
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingWordPrint {
  def main(args: Array[String]): Unit = {
    // local[2]: one thread receives data, the other processes the batches
    val conf = new SparkConf().setMaster("local[2]").setAppName("B_StreamingWordPrint")
    // Batch interval of 1 second: the stream is cut into 1-second batches
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.sparkContext.setLogLevel("WARN")

    // Receive lines of text from a TCP socket source
    val lines = ssc.socketTextStream("bigdata-hpsk01.ares.com", 9999)
    val words = lines.flatMap(_.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)

    // Print each batch's result together with its batch time
    wordCounts.foreachRDD((rdd, time) => {
      println("-----------------------------------")
      val batchTime = time.milliseconds
      val sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss")
      val batchDateTime = sdf.format(new Date(batchTime))
      println(s"Time: ${batchDateTime}")
      println("-----------------------------------")
      if (!rdd.isEmpty()) {
        println("=============================")
        rdd.coalesce(1).foreachPartition(_.foreach(println))
      }
    })

    // Save each non-empty batch's result to MySQL, one JDBC connection per partition
    wordCounts.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        rdd.coalesce(1).foreachPartition(iter => {
          // 1. Get a connection
          Class.forName("com.mysql.jdbc.Driver")
          val url = "jdbc:mysql://bigdata-hpsk01.ares.com/test"
          val userName = "root"
          val password = "123456"
          var conn: Connection = null
          var pst: PreparedStatement = null
          try {
            conn = DriverManager.getConnection(url, userName, password)
            pst = conn.prepareStatement("INSERT INTO tb_result_Streaming(k, v) VALUES(?, ?)")
            // 2. Write every (word, count) pair of this partition
            iter.foreach { case (k, v) =>
              println(s"k = ${k}, v = ${v}")
              pst.setString(1, k)
              pst.setInt(2, v)
              pst.executeUpdate()
            }
          } catch {
            case e: Exception => e.printStackTrace()
          } finally {
            // 3. Release resources
            if (pst != null) pst.close()
            if (conn != null) conn.close()
          }
        })
      }
    })

    wordCounts.print()
    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
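Usage note (assumptions beyond the code above): to feed the socket source, first start a plain TCP server on the configured host and port, e.g. with netcat (nc -lk 9999), then type space-separated words; each line typed within a 1-second interval shows up in that batch's output. The MySQL sink assumes the test database already has a table tb_result_Streaming with a string column k and an integer column v, and that the mysql-connector-java driver is on the classpath; the host name, credentials, and table name are just the values used in the code and should be adjusted to your environment.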