一、SparkStreaming读取Kafka的两种模式:
1、Receiver(实时读取)
通过zookeeper来连接kafka队列,使用Kafka的高层次Consumer API来实现的。不过这种方式是先把数据从kafka中读取出来,然后缓存在内存,再定时处理。如果这时候集群退出,而偏移量又没处理好的话,数据就丢掉了,存在程序失败丢失数据的可能。1.2之后引入spark.streaming.receiver.writeAheadLog.enable以规避此风险。
2、Direct(定时批量读取)
直接连接到kafka的节点上获取数据,周期性地查询Kafka,来获得每个topic+partition的最新的offset,从而定义每个batch的offset的范围。当处理数据的job启动时,就会使用Kafka的简单consumer api来获取Kafka指定offset范围的数据。
直接读取方式相对传统Receiver方式的优点:简化并行,高效,精确一次。
二、案例演示(只演示Direct模式)
1、一个简单的演示:
package TestExamples
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
/**
 * Minimal Spark Streaming + Kafka integration demo (Direct mode):
 * reads comma-separated records from the "mytopic" topic and prints
 * a per-batch word count.
 */
object TestSparkStreaming5 {
  def main(args: Array[String]): Unit = {
    // Program entry point: local 2-core master, 2-second micro-batches.
    val sparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName(s"${this.getClass.getSimpleName}")
    val sparkContext = new SparkContext(sparkConf)
    val streamingContext = new StreamingContext(sparkContext, Seconds(2))

    /**
     * Data source — createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]]
     * (ssc: StreamingContext,
     *  kafkaParams: Map[String, String],
     *  topics: Set[String])
     */
    val kafkaParams = Map("metadata.broker.list" -> "hadoop1:9092")
    val topics = Set("mytopic")

    // Each element is (k, v): k carries Kafka metadata (topic, offset),
    // v is the message payload — we keep only the payload.
    val messages = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](streamingContext, kafkaParams, topics)
      .map(pair => pair._2)

    // Split on commas, count each token, print the counts per batch.
    messages
      .flatMap(line => line.split(","))
      .map(token => (token, 1))
      .reduceByKey((a, b) => a + b)
      .print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
2、Kafka多线程代码演示(用并行的方法消费kafka的数据),有两种方法,没有使用线程池和使用线程池。
(1)没有使用线程池
package TestExamples.SparkStreaming_Kafka;
import kafka.producer.KeyedMessage;
import kafka.javaapi.producer.Producer;
import kafka.producer.ProducerConfig;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Producer without a thread pool: sends three batches of keyed messages
 * to the "hahaha" topic, then closes the producer.
 */
public class ProducerDemo {
    public static void main(String[] args) {
        // props holds the producer configuration.
        Properties props = new Properties();
        // metadata.broker.list: Kafka broker address(es); several brokers may be listed.
        props.put("metadata.broker.list", "hadoop1:9092");
        // Serializer used when writing data into Kafka.
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        // Build a ProducerConfig from props.
        ProducerConfig config = new ProducerConfig(props);
        // Create the producer.
        Producer<String, String> producer = new Producer<String, String>(config);
        try {
            for (int j = 0; j < 3; j++) {
                List<KeyedMessage<String, String>> messageList = new ArrayList<KeyedMessage<String, String>>();
                for (int i = 200; i <= 300; i++) {
                    KeyedMessage<String, String> message =
                            new KeyedMessage<String, String>("hahaha", j + "", "producer-parition" + j + "->" + i);
                    System.out.println("producer-parition" + j + "->" + i);
                    messageList.add(message);
                }
                producer.send(messageList);
            }
        } finally {
            // FIX: the original never closed the producer, leaking its network
            // resources and risking loss of buffered messages on exit.
            producer.close();
        }
    }
}
package TestExamples.SparkStreaming_Kafka;
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
 * Consumer without a thread pool: spawns one raw Thread per Kafka stream
 * and prints every message it receives.
 */
public class ConsumerDemo {
    // Topic to consume and number of consumer threads to create.
    private static final String topic = "hahaha";
    private static final Integer threads = 3;

    public static void main(String[] args) {
        Properties properties = new Properties();
        properties.put("zookeeper.connect", "hadoop1:2181,hadoop1:2182,hadoop1:2183");
        // Consumer group id.
        properties.put("group.id", "1706");

        ConsumerConfig config = new ConsumerConfig(properties);
        ConsumerConnector connector = Consumer.createJavaConsumerConnector(config);

        HashMap<String, Integer> streamCounts = new HashMap<>();
        streamCounts.put(topic, threads);
        Map<String, List<KafkaStream<byte[], byte[]>>> streamsByTopic = connector.createMessageStreams(streamCounts);
        List<KafkaStream<byte[], byte[]>> topicStreams = streamsByTopic.get(topic);

        // One result per thread; typically the thread count matches the partition count.
        for (final KafkaStream<byte[], byte[]> stream : topicStreams) {
            Thread worker = new Thread(new Runnable() {
                @Override
                public void run() {
                    for (MessageAndMetadata<byte[], byte[]> record : stream) {
                        String msg = new String(record.message());
                        final int partition = record.partition();
                        System.out.println(Thread.currentThread().getId() + "分区号" + partition + " 信息 " + msg + "偏移量" + record.offset());
                    }
                }
            });
            worker.start();
        }
    }
}
(2)使用线程池
package TestExamples.SparkStreaming_Kafka;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Consumer using a fixed thread pool: requests several Kafka streams for one
 * topic and submits one KafkaConsumerThread per stream to an ExecutorService.
 */
public class ThreadPoolTest implements Runnable {
    private ConsumerConfig consumerConfig;
    private static String topic = "hahaha";
    Properties props;
    // Number of streams requested and pool size; usually matches the partition count.
    final int a_numThreads = 3;

    public ThreadPoolTest() {
        props = new Properties();
        props.put("zookeeper.connect", "hadoop1:2181,hadoop1:2182,hadoop1:2183");
        props.put("group.id", "1707");
        props.put("zookeeper.session.timeout.ms", "400");
        props.put("auto.commit.interval.ms", "1000");
        consumerConfig = new ConsumerConfig(props);
    }

    @Override
    public void run() {
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(topic, new Integer(a_numThreads));
        // FIX: reuse the config built in the constructor — the original created a
        // second, identical ConsumerConfig here, leaving the field a dead store.
        ConsumerConnector consumer = kafka.consumer.Consumer.createJavaConsumerConnector(consumerConfig);
        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
        List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topic);
        // NOTE(review): the executor is never shut down — acceptable for a demo
        // whose workers consume indefinitely, but a real service should call
        // executor.shutdown() and consumer.shutdown() on termination.
        ExecutorService executor = Executors.newFixedThreadPool(a_numThreads);
        for (final KafkaStream stream : streams) {
            executor.submit(new KafkaConsumerThread(stream));
        }
    }

    public static void main(String[] args) { // smoke test
        System.out.println(topic);
        Thread t = new Thread(new ThreadPoolTest());
        t.start();
    }
}
package TestExamples.SparkStreaming_Kafka;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.message.MessageAndMetadata;
/**
 * Pool worker: drains one KafkaStream, printing each message together with
 * its partition and offset.
 */
public class KafkaConsumerThread implements Runnable {
    private KafkaStream<byte[], byte[]> stream;

    public KafkaConsumerThread(KafkaStream<byte[], byte[]> stream) {
        this.stream = stream;
    }

    @Override
    public void run() {
        // KafkaStream is iterable; the enhanced for-loop blocks waiting for
        // new messages, so this runs until the consumer is shut down.
        for (MessageAndMetadata<byte[], byte[]> record : stream) {
            System.out.println(Thread.currentThread().getName() + ": partition[" + record.partition() + "],"
                    + "offset[" + record.offset() + "], " + new String(record.message()));
        }
    }
}
三、控制offset让kafka不丢数据
package TestExamples.SparkStreaming_Kafka
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Direct-stream consumer that avoids data loss by persisting each batch's
 * Kafka offsets to ZooKeeper and resuming from them on restart.
 */
object KafkaDirectStream {
  def main(args: Array[String]): Unit = {
    val group = "1708"
    val conf = new SparkConf().setAppName("KafkaDirectStream").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    val topic = "hahaha"
    val brokerList = "hadoop1:9092"
    val zkQuorum = "hadoop1:2181,hadoop1:2182,hadoop1:2183"
    val topics = Set(topic)
    // ZK directory where this group's offsets for the topic live —
    // presumably /consumers/<group>/offsets/<topic>; verify against the
    // ZKGroupTopicDirs implementation of the Kafka version in use.
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    val zkPath = topicDirs.consumerOffsetDir
    val kafkaPrams = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> group
    )
    val zKClient = new ZkClient(zkQuorum)
    // One child node per partition holds that partition's saved offset.
    val children = zKClient.countChildren(zkPath)
    // If ZooKeeper already holds offsets, resume the stream from them.
    var fromOffsets: Map[TopicAndPartition, Long] = Map()
    var kafkaStream: InputDStream[(String, String)] = null
    if (children > 0) {
      for (i <- 0 until children) {
        // Read the saved offset for partition i and record it as the
        // starting position for that partition.
        val partitionOffset = zKClient.readData[String](s"${zkPath}/${i}")
        val tp = TopicAndPartition(topic, i)
        fromOffsets += (tp -> partitionOffset.toLong)
      }
      /**
       * createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]
       * (ssc: StreamingContext,
       *  kafkaParams: Map[String, String],
       *  fromOffsets: Map[TopicAndPartition, Long],
       *  messageHandler: (MessageAndMetadata[K, V]) => R)
       */
      // Transform each Kafka record into a (topic_name, message) tuple.
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaPrams, fromOffsets, messageHandler)
    } else {
      // No saved offsets: start from the newest or oldest offset according
      // to the kafkaParams configuration.
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaPrams, topics)
    }
    var offsetRanges = Array[OffsetRange]()
    kafkaStream.transform(rdd => {
      // Capture the offset ranges here, before downstream transformations
      // lose the HasOffsetRanges view of the RDD.
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }).map(msg => msg._2)
      .foreachRDD(rdd => {
        rdd.foreachPartition(partition => {
          partition.foreach(record => {
            println(record)
          })
        })
        for (o <- offsetRanges) {
          val newZkPath = s"${zkPath}/${o.partition}"
          // FIX: persist untilOffset (the end of the processed batch), not
          // fromOffset — saving fromOffset never advances the stored position,
          // so every restart would replay the same batch.
          ZkUtils.updatePersistentPath(zKClient, newZkPath, o.untilOffset.toString)
        }
      })
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}