消费kafka数据

spark消费kafka数据

1.导入maven
<properties>
	<spark.version>2.1.1</spark.version>
</properties>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
2.创建kafkaconsumer.properties
bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
group.id=spark_0615
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer
auto.offset.reset=earliest
3.消费kafka数据
// Build the SparkConf for a local run named "KafkaTest"
val conf: SparkConf = new SparkConf().setAppName("KafkaTest").setMaster("local[*]")
// Cap the rate at which each Kafka partition is read
conf.set("spark.streaming.kafka.maxRatePerPartition", "100")
// Turn on backpressure. The property name is "spark.streaming.backpressure.enabled";
// the former "spark.streaming.pressure.enable" is not a Spark setting, so it had no effect.
conf.set("spark.streaming.backpressure.enabled", "true")
// Let in-flight batches finish when the JVM shuts down
conf.set("spark.streaming.stopGracefullyOnShutdown", "true")

// Create the StreamingContext with a 5-second batch interval
val ssc = new StreamingContext(conf, Seconds(5))

// Kafka consumer settings
val kafkaPara: Map[String, String] = Map(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
  ConsumerConfig.GROUP_ID_CONFIG -> "bigdata",
  ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
  ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer")

// Explicit starting offset: partition 0 of topic "first" starts at offset 1
val offsets = Map(new TopicPartition("first", 0) -> 1L)

val consumerStrategy: ConsumerStrategy[String, String] = ConsumerStrategies.Subscribe[String, String](List("first"), kafkaPara, offsets)

val inputDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, consumerStrategy)

// Holds the offset ranges of the most recent batch; reassigned inside transform below
var offsetRanges: Array[OffsetRange] = Array.empty[OffsetRange]

// Capture the batch's offset ranges first (the cast only works on the RDD that comes
// directly from the Kafka stream), then keep just the record values
val lineDStream: DStream[String] = inputDStream.transform(rdd => {
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.map(x => x.value())
})

lineDStream.print()

// Report the offsets consumed by each batch. The previous top-level
// `offsetRanges.foreach(x => x.untilOffset)` ran once before the context started,
// over an empty array, and discarded its result, so it never did anything.
lineDStream.foreachRDD { _ =>
  offsetRanges.foreach { range =>
    println(s"${range.topic} ${range.partition} ${range.fromOffset} ${range.untilOffset}")
  }
}

ssc.start()
ssc.awaitTermination()

flink消费kafka数据

1.导入maven
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
2.创建kafkaconsumer.properties
bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
group.id=flink_0615
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer
3.消费kafka的value数据
/**
 * Streaming word count over the values of the Kafka topic "flink".
 *
 * Consumer settings are loaded from "kafkaconsumer.properties" through
 * ProPertiesUtil; each record value is split on single spaces and a running
 * count per word is printed.
 */
object FromKafkaSource {
    def main(args: Array[String]): Unit = {
        // Execution environment for the streaming job
        val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

        // Kafka consumer configuration
        val consumerProps: Properties = ProPertiesUtil.getProperties("kafkaconsumer.properties")

        // Source: record values of topic "flink", decoded as plain strings
        val kafkaSource = new FlinkKafkaConsumer011[String]("flink", new SimpleStringSchema(), consumerProps)
        val lines: DataStream[String] = streamEnv.addSource(kafkaSource)

        // line -> words -> (word, 1) -> running sum keyed by the word (tuple field 0)
        val words: DataStream[String] = lines.flatMap(line => line.split(" "))
        val pairs: DataStream[(String, Int)] = words.map(word => (word, 1))
        val counts: DataStream[(String, Int)] = pairs.keyBy(0).sum(1)

        // Sink: print the running counts
        counts.print()

        // Submit the job
        streamEnv.execute()
    }
}
4.消费kafka的key-value数据
/**
 * Reads (key, value) pairs from the Kafka topic "flink" and prints them.
 *
 * Consumer settings are loaded from "kafkaconsumer.properties" through
 * ProPertiesUtil. Key and value are decoded as UTF-8 independently; a missing
 * (null) key or value is replaced by the literal string "null", so a record
 * without a key still keeps its value.
 */
object FromKafkaSource {
    def main(args: Array[String]): Unit = {
        // Create the streaming execution environment
        val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        // Load the Kafka consumer configuration
        val properties: Properties = ProPertiesUtil.getProperties("kafkaconsumer.properties")
        // Add the Kafka source with a schema that exposes both key and value
        val keyToValueDataStream: DataStream[(String, String)] = env.addSource(new FlinkKafkaConsumer011[(String, String)]("flink", new KeyedDeserializationSchema[(String, String)] {
            // No element marks the end of the stream
            override def isEndOfStream(nextElement: (String, String)): Boolean = false

            // Decode each side on its own. Previously a null key OR a null value turned
            // the whole record into ("null", "null"), discarding the payload of every
            // keyless message.
            override def deserialize(messageKey: Array[Byte], message: Array[Byte], topic: String, partition: Int, offset: Long): (String, String) = {
                (decodeOrPlaceholder(messageKey), decodeOrPlaceholder(message))
            }

            override def getProducedType: TypeInformation[(String, String)] = {
                createTuple2TypeInformation(createTypeInformation[String], createTypeInformation[String])
            }

            // UTF-8 decode, or the fixed placeholder "null" when the bytes are absent
            private def decodeOrPlaceholder(bytes: Array[Byte]): String = {
                Option(bytes).fold("null")(present => new String(present, "UTF-8"))
            }
        }, properties))
        // Print every (key, value) pair
        keyToValueDataStream.print()
        // Submit the job
        env.execute()
    }
}
发布了29 篇原创文章 · 获赞 0 · 访问量 540
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 深蓝海洋 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览