createStream: only one executor hosts the (configurable number of) receivers that pull data from Kafka; the records are then shipped to the other executors for processing. This was confirmed with a thread dump.
createDirectStream: every executor pulls from Kafka itself, each executor task consuming one Kafka partition. This can be observed by running iftop -n -i em1 on a single Kafka node; no dedicated receiver threads show up in a thread dump.
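A practical consequence of the receiver model: each receiver permanently occupies one executor core, so the application must be given more cores than it has receivers, or no capacity is left for processing the batches. A minimal sketch of sizing this (the value 8, the app name, and the use of spark.cores.max, which applies to standalone/Mesos deployments, are assumptions, not taken from the test setup):

import org.apache.spark.SparkConf;

SparkConf conf = new SparkConf()
        .setAppName("receiver-core-sizing") // hypothetical app name
        .set("spark.cores.max", "8");       // must exceed the number of receivers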
createStream==========================================================================================
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

SparkConf sparkConf = new SparkConf().setAppName(sparkAppName).setMaster(master);
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(duration));

// topic -> number of consumer threads inside the receiver.
// Note: the ZooKeeper-based createStream(jssc, zkQuorum, groupId, topicMap)
// takes no kafkaParams map; consumer settings go through ZooKeeper.
Map<String, Integer> topicMap = new HashMap<String, Integer>();
for (String topic : topicStr.split(",")) {
    topicMap.put(topic, ConfigMgr.getIntByKey("spark.thread.num"));
}
JavaPairReceiverInputDStream<String, String> lines = KafkaUtils.createStream(jssc, zookeeper,
        ConfigMgr.getKakfaGroupId(), topicMap);

VoidFunction<JavaPairRDD<String, String>> func = new VoidFunction<JavaPairRDD<String, String>>() {
    private static final long serialVersionUID = -7821297251721419326L;
    private Logger logger = LoggerFactory.getLogger(VoidFunction.class);

    @Override
    public void call(JavaPairRDD<String, String> rdd) throws Exception {
        try {
            rdd.foreach(new VoidFunction<Tuple2<String, String>>() {
                private static final long serialVersionUID = -8745159565584246451L;

                @Override
                public void call(Tuple2<String, String> record) throws Exception {
                    try {
                        execute(record._2); // application-specific handling of the message value
                    } catch (Exception e) {
                        logger.error(e.getMessage(), e);
                    }
                }
            });
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }
};
lines.foreachRDD(func);
jssc.start();
jssc.awaitTermination();
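If one receiver cannot keep up, the standard pattern from the Spark Streaming guide is to create several receiver streams and union them, so the receivers can be scheduled onto different executors. A sketch along those lines, reusing jssc, zookeeper and topicMap from above (numReceivers = 3 is an assumption):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.kafka.KafkaUtils;

int numReceivers = 3; // assumed; each receiver needs its own core
List<JavaPairDStream<String, String>> streams = new ArrayList<JavaPairDStream<String, String>>();
for (int i = 0; i < numReceivers; i++) {
    streams.add(KafkaUtils.createStream(jssc, zookeeper, ConfigMgr.getKakfaGroupId(), topicMap));
}
// Union into a single DStream; the downstream foreachRDD logic stays unchanged.
JavaPairDStream<String, String> unioned = jssc.union(streams.get(0), streams.subList(1, streams.size()));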
createDirectStream==========================================================================================
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

SparkConf sparkConf = new SparkConf().setAppName(sparkAppName).setMaster(master);
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(duration));

Set<String> topicSet = new HashSet<String>();
for (String topic : topicStr.split(",")) {
    topicSet.add(topic);
}
// The direct stream talks to the brokers itself, so it needs metadata.broker.list;
// group.id identifies the consumer, auto.offset.reset picks the starting position.
Map<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("group.id", groupId);
kafkaParams.put("metadata.broker.list", metadataBrokerList);
kafkaParams.put("auto.offset.reset", autoOffsetReset);

JavaPairInputDStream<String, String> pairInput = KafkaUtils.createDirectStream(jssc, String.class,
        String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicSet);

VoidFunction<JavaPairRDD<String, String>> func = new VoidFunction<JavaPairRDD<String, String>>() {
    private static final long serialVersionUID = -7821297251721419326L;
    private Logger logger = LoggerFactory.getLogger(VoidFunction.class);

    @Override
    public void call(JavaPairRDD<String, String> rdd) throws Exception {
        try {
            rdd.foreach(new VoidFunction<Tuple2<String, String>>() {
                private static final long serialVersionUID = -8745159565584246451L;

                @Override
                public void call(Tuple2<String, String> record) throws Exception {
                    try {
                        System.out.println(record._2);
                    } catch (Exception e) {
                        logger.error(e.getMessage(), e);
                    }
                }
            });
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }
};
pairInput.foreachRDD(func);
jssc.start();
jssc.awaitTermination();
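Unlike the receiver version, the direct stream's RDDs know exactly which offsets they cover: each underlying RDD is a KafkaRDD implementing HasOffsetRanges (spark-streaming-kafka 0.8 API). A sketch of reading them, attached to pairInput from above; the cast is only valid on the first DStream in the chain, before any transformation:

import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.OffsetRange;

pairInput.foreachRDD(new VoidFunction<JavaPairRDD<String, String>>() {
    @Override
    public void call(JavaPairRDD<String, String> rdd) throws Exception {
        // Valid because the direct stream produces KafkaRDDs.
        OffsetRange[] ranges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
        for (OffsetRange r : ranges) {
            System.out.println(r.topic() + "-" + r.partition()
                    + ": [" + r.fromOffset() + ", " + r.untilOffset() + ")");
        }
    }
});

These ranges are exactly what the fromOffsets map in the next section expects as starting points.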
createDirectStream + custom offset==========================================================================================
import java.util.HashMap;
import java.util.Map;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import com.alibaba.fastjson.JSONObject;

SparkConf sparkConf = new SparkConf().setAppName(sparkAppName).setMaster(master);
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(duration));

Map<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("group.id", groupId);
kafkaParams.put("metadata.broker.list", metadataBrokerList);
kafkaParams.put("auto.offset.reset", autoOffsetReset);

// Starting offsets for the topic's 10 partitions, hardcoded for this test.
// With explicit fromOffsets there is no topic set: the topics/partitions to
// consume are exactly the keys of this map.
long[] off = new long[] { 3316538, 2767422, 3332371, 3330540, 3863203, 3315774, 3867953, 3328188, 3325543,
        3892565 };
Map<TopicAndPartition, Long> fromOffsets = new HashMap<TopicAndPartition, Long>();
for (int i = 0; i < off.length; i++) {
    fromOffsets.put(new TopicAndPartition(topicStr, i), off[i]);
}

// The message handler maps each MessageAndMetadata to the record type (here: the raw value).
JavaInputDStream<String> jid = KafkaUtils.createDirectStream(jssc, String.class, String.class,
        StringDecoder.class, StringDecoder.class, String.class, kafkaParams, fromOffsets,
        new Function<MessageAndMetadata<String, String>, String>() {
            private static final long serialVersionUID = -6590667828252772663L;

            @Override
            public String call(MessageAndMetadata<String, String> mam) throws Exception {
                return mam.message();
            }
        });

final VoidFunction<String> func0 = new VoidFunction<String>() {
    private static final long serialVersionUID = -2520206838533422786L;

    @Override
    public void call(String line) throws Exception {
        tugBoat.execute(JSONObject.parseObject(line)); // application-specific processing
    }
};
VoidFunction<JavaRDD<String>> func = new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = 7679681553001908774L;

    @Override
    public void call(JavaRDD<String> rdd) throws Exception {
        rdd.foreach(func0);
    }
};
jid.foreachRDD(func);
jssc.start();
jssc.awaitTermination();
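The hardcoded off array only works for a one-off test. To make the job resumable, the untilOffset of every batch has to be persisted and fed back into fromOffsets on the next start. A sketch of the persisting half, where OffsetStore is a hypothetical placeholder for whatever storage is used (ZooKeeper, a database, ...), not a real API:

import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.OffsetRange;

jid.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    @Override
    public void call(JavaRDD<String> rdd) throws Exception {
        // Works because the direct stream's underlying RDD is a KafkaRDD.
        OffsetRange[] ranges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
        for (OffsetRange r : ranges) {
            // Hypothetical store: record the end offset per (topic, partition).
            OffsetStore.save(r.topic(), r.partition(), r.untilOffset());
        }
    }
});

For at-least-once semantics the save should happen after the batch's processing succeeds, i.e. inside the same foreachRDD as the processing rather than in a second one as shown here.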