Spark Streaming integration with Kafka
-
A rough outline of the batch and streaming computation architectures
-
Offline (batch)
flume--->hdfs--->mapreduce/hive--->sqoop--->hbase/mysql/redis/hdfs
-
Streaming
flume--->kafka--->storm/sparkstreaming--->mysql/redis/hbase
-
-
The examples below assume a flume ---> kafka pipeline: the data is read from Kafka, so before running the programs you must start Kafka and Flume, or push some test data into the topic yourself with a Kafka producer (for example the kafka-console-producer.sh script shipped with Kafka, or the sketch below).
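If Flume is not available, a handful of test records can be produced by hand. This is a minimal sketch using the plain Kafka producer API; the broker address hadoop:9092 and the topic "first" are taken from the programs below, while the class name and record contents are made up for illustration.

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Hypothetical helper: push a few space-separated lines into the "first" topic
// so the word-count jobs below have something to read.
object TestProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop:9092") // broker assumed from the examples below
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    (1 to 10).foreach { i =>
      producer.send(new ProducerRecord[String, String]("first", s"hello spark streaming kafka $i"))
    }
    producer.close()
  }
}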
-
Receiver-based Spark Streaming integration
package com.auar.ssk

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming + Kafka, approach 1: receiver-based.
 * Kafka delivers records to a receiver, which acknowledges them.
 * The delivery semantics come in three flavors:
 *   1. at-least-once: may duplicate data
 *   2. at-most-once: may lose data
 *   3. exactly-once: no loss or duplication, but the slowest
 */
object SSK01 {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val checkPoint = "hdfs://hadoop:9000/ssk"
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SSK")
    val streamingContext = new StreamingContext(sparkConf, Seconds(4))
    streamingContext.checkpoint(checkPoint)
    streamingContext.sparkContext.setLogLevel("WARN")

    // topic -> number of receiver threads
    val topics = Map("first" -> 2)
    // Arguments: ZooKeeper quorum, consumer group id, topics, storage level.
    // Records come out of Kafka as (key, value) pairs; keep only the value.
    val kafkaDStream: DStream[String] = KafkaUtils.createStream(
      streamingContext, "hadoop", "ssk", topics, StorageLevel.MEMORY_AND_DISK_SER).map(_._2)

    // Stateful word count: add this batch's counts to the running total per word
    val result: DStream[(String, Int)] = kafkaDStream.flatMap(_.split(" ")).map(x => (x, 1))
      .updateStateByKey((newValue: Seq[Int], state: Option[Int]) => {
        Some(newValue.sum + state.getOrElse(0))
      })
    result.print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
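A note on the delivery semantics listed above: the receiver acknowledges records once they are stored in Spark, so records that have been received but not yet processed can still be lost if the driver fails. Spark's standard remedy is the receiver write-ahead log, which persists received blocks to the checkpoint directory before they are acknowledged. A minimal sketch of the extra configuration, assuming the same SparkConf as SSK01 (the WAL key is a standard Spark setting, not something used in the original code):

import org.apache.spark.SparkConf

// Sketch: enable the receiver write-ahead log so blocks already handed to the
// receiver survive a failure; it writes into the HDFS checkpoint directory
// that SSK01 already configures.
val sparkConf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("SSK")
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
// With the WAL enabled, replicating received data in memory is redundant,
// which is why the non-replicated StorageLevel.MEMORY_AND_DISK_SER used above fits.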
-
Direct-based Spark Streaming integration
-
With the direct approach, Spark Streaming pulls data from Kafka itself instead of passively receiving it through a receiver.
package com.auar.ssk

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SSK02 {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val checkPoint = "hdfs://hadoop:9000/ssk02"
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SSK")
    val streamingContext = new StreamingContext(sparkConf, Seconds(4))
    streamingContext.checkpoint(checkPoint)
    streamingContext.sparkContext.setLogLevel("WARN")

    // Broker list; the direct approach talks to the brokers, not ZooKeeper
    val kafkaParams: Map[String, String] = Map("bootstrap.servers" -> "hadoop:9092")
    // Records come out as (key, value) pairs; keep only the value
    val kafkaStreams: DStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      streamingContext, kafkaParams, Set("first")).map(_._2)

    // Stateful word count: add this batch's counts to the running total per word
    val result: DStream[(String, Int)] = kafkaStreams.flatMap(_.split(" ")).map(x => (x, 1))
      .updateStateByKey((newValue: Seq[Int], state: Option[Int]) => {
        Some(newValue.sum + state.getOrElse(0))
      })
    result.print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
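In direct mode there is no receiver and no ZooKeeper-tracked consumer group; Spark itself tracks the consumed offsets via the checkpoint. If the offsets are needed explicitly (for example to store them in an external system), they can be read per micro-batch, but only from the stream returned by createDirectStream itself, before any transformation such as the .map(_._2) above. A minimal sketch, meant to be dropped into SSK02's main and reusing its streamingContext:

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}

// Sketch: inspect the Kafka offset range of every micro-batch in direct mode.
// The cast only works on the InputDStream returned by createDirectStream,
// not on a transformed DStream such as kafkaStreams above.
val directStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  streamingContext, Map("bootstrap.servers" -> "hadoop:9092"), Set("first"))

directStream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  offsetRanges.foreach { o =>
    println(s"${o.topic} partition ${o.partition}: from ${o.fromOffset} to ${o.untilOffset}")
  }
}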