一、概述
- Spark Streaming 以 Kafka 作为数据源,手动管理 offsets,并将其保存在 ZooKeeper 中。
- SparkStreamingOnKafka(入口)。
- KafkaZookeeperUtils(获取数据和更新offsets工具类)。
二、代码
1、SparkStreamingOnKafka
package com.cfl.spark.streaming;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import com.cfl.spark.streaming.KafkaZookeeperUtils.DataCallBack;
/**
 * Entry point: reads lines from a Kafka topic through Spark Streaming and
 * prints them, delegating stream creation and offset bookkeeping to
 * {@link KafkaZookeeperUtils#getData}.
 *
 * @author chenfenli
 */
public class SparkStreamingOnKafka {
    // Connection settings are fixed for this sample, so they are constants
    // rather than mutable static state.
    private static final String KAFKA_SERVER = "192.168.1.103:9092";
    private static final String ZK_SERVER = "192.168.1.103:2181";
    private static final String GROUP_ID = "test6";
    private static final String TOPIC = "t0407";

    /**
     * Builds a streaming context with a 5 second batch interval, registers the
     * Kafka stream with a callback that prints every line, and blocks until
     * the context terminates.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("SparkStreamingOnKafka");
        sparkConf.setMaster("local");

        JavaStreamingContext context = new JavaStreamingContext(sparkConf, Durations.seconds(5));
        KafkaZookeeperUtils.getData(context, KAFKA_SERVER, ZK_SERVER, GROUP_ID, TOPIC, new DataCallBack() {
            @Override
            public boolean data(List<String> lines) {
                // Return true to have the offsets updated, false to leave them untouched.
                try {
                    for (String line : lines) {
                        System.out.println(line);
                    }
                    return true;
                } catch (Exception e) {
                    System.out.println(e);
                    return false;
                }
            }
        });

        try {
            context.start();
            context.awaitTermination();
        } finally {
            // Stop the context even when start/awaitTermination exits with an
            // exception; previously stop() was skipped on that path.
            context.stop();
        }
    }
}
2、KafkaZookeeperUtils
package com.cfl.spark.streaming;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.I0Itec.zkclient.ZkClient;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import kafka.utils.ZKGroupTopicDirs;
import kafka.utils.ZkUtils;
import scala.Tuple2;
/**
 * Helper that creates a Kafka direct stream whose consumed offsets are managed
 * manually and persisted in ZooKeeper under the consumer group's offset
 * directory for the topic.
 */
public class KafkaZookeeperUtils implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Registers a Kafka direct stream on {@code context} and hands every batch
     * to {@code dataCallBack}.
     *
     * <p>If offsets are already stored in ZooKeeper for {@code groupId} and
     * {@code topic}, consumption resumes from them. Otherwise the stream is
     * created from the topic name alone (first consumption). After each batch,
     * the batch's {@code untilOffset} per partition is written back to
     * ZooKeeper only when the callback returns {@code true}.
     *
     * @param context      streaming context the stream is registered on
     * @param kafkaServer  broker list, used as {@code metadata.broker.list}
     * @param zkServer     ZooKeeper connection string
     * @param groupId      consumer group whose offsets are read and written
     * @param topic        topic to consume
     * @param dataCallBack receives the collected lines of each batch; returns
     *                     {@code true} to update the offsets, {@code false} to skip
     */
    public static void getData(JavaStreamingContext context, String kafkaServer, final String zkServer, String groupId, String topic, final DataCallBack dataCallBack) {
        // Holds the offset ranges of the batch currently being processed; set
        // in the transform step and read in the foreachRDD step.
        final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();

        Map<String, String> kafkaParameters = new HashMap<>();
        kafkaParameters.put("metadata.broker.list", kafkaServer);

        // Load the currently stored offsets.
        final String zkTopicPath = new ZKGroupTopicDirs(groupId, topic).consumerOffsetDir();
        Map<TopicAndPartition, Long> fromOffsets = readStoredOffsets(zkServer, zkTopicPath, topic);

        // Build the stream of message values; both branches record the batch's
        // offset ranges before any further transformation.
        JavaDStream<String> lines;
        if (!fromOffsets.isEmpty()) {
            // Not the first consumption: resume from the stored offsets.
            JavaInputDStream<String> inputDStream = KafkaUtils.createDirectStream(context, String.class, String.class, StringDecoder.class, StringDecoder.class, String.class, kafkaParameters, fromOffsets, new Function<MessageAndMetadata<String, String>, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public String call(MessageAndMetadata<String, String> record) throws Exception {
                    return record.message();
                }
            });
            lines = inputDStream.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
                    offsetRanges.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
                    return rdd;
                }
            });
        } else {
            // First consumption: starts from the largest offset. To consume from
            // the first message instead, initialise fromOffsets manually and use
            // the branch above.
            HashSet<String> topics = new HashSet<>();
            topics.add(topic);
            lines = KafkaUtils.createDirectStream(context, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParameters, topics)
                    .transformToPair(new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) throws Exception {
                            offsetRanges.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
                            return rdd;
                        }
                    })
                    .map(new Function<Tuple2<String, String>, String>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public String call(Tuple2<String, String> record) throws Exception {
                            return record._2;
                        }
                    });
        }

        // Single sink shared by both branches (previously duplicated).
        lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaRDD<String> rdd) throws Exception {
                // Business logic runs here, on the collected batch.
                boolean processed = dataCallBack.data(rdd.collect());
                // Update the offsets only when the callback asks for it.
                if (processed) {
                    commitOffsets(zkServer, zkTopicPath, offsetRanges.get());
                }
            }
        });
    }

    /**
     * Reads the stored offset of every partition child under {@code zkTopicPath}.
     * Children are addressed as {@code 0 .. countChildren - 1}, i.e. this
     * assumes partition nodes are numbered contiguously from zero.
     *
     * @return partition-to-offset map; empty when nothing is stored yet
     */
    private static Map<TopicAndPartition, Long> readStoredOffsets(String zkServer, String zkTopicPath, String topic) {
        Map<TopicAndPartition, Long> fromOffsets = new HashMap<>();
        ZkClient zkClient = new ZkClient(zkServer);
        try {
            int partitionCount = zkClient.countChildren(zkTopicPath);
            for (int partition = 0; partition < partitionCount; partition++) {
                String offset = zkClient.readData(zkTopicPath + "/" + partition);
                fromOffsets.put(new TopicAndPartition(topic, partition), Long.parseLong(offset));
            }
        } finally {
            // Close the connection even if a read or parse fails.
            zkClient.close();
        }
        return fromOffsets;
    }

    /**
     * Writes each range's {@code untilOffset} to {@code zkTopicPath/<partition>}.
     * Does nothing (and opens no connection) when {@code offsets} is null.
     */
    private static void commitOffsets(String zkServer, String zkTopicPath, OffsetRange[] offsets) {
        if (offsets == null) {
            return;
        }
        ZkClient zkClient = new ZkClient(zkServer);
        try {
            for (OffsetRange range : offsets) {
                String zkPath = zkTopicPath + "/" + range.partition();
                ZkUtils.updatePersistentPath(zkClient, zkPath, String.valueOf(range.untilOffset()));
            }
        } finally {
            // Close the connection even if an update fails part-way through.
            zkClient.close();
        }
    }

    /** Callback invoked with the lines of each batch. */
    interface DataCallBack {
        /**
         * @param lines message values collected from the current batch
         * @return {@code true} to update the stored offsets, {@code false} to leave them unchanged
         */
        boolean data(List<String> lines);
    }
}