Real-time word count (WordCount) with Spark Streaming reading data from Kafka
Dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.12</artifactId>
        <version>3.1.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
        <version>3.1.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.12</artifactId>
        <version>3.1.2</version>
    </dependency>
</dependencies>
Imports:
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;
import java.util.*;
Java code:
public class DataFromKafka {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("kafka");
        // One batch every 5 seconds
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Duration.apply(5000));
        // Raise the log level so the results are easier to read
        ssc.sparkContext().setLogLevel("WARN");
        // Kafka consumer parameters
        HashMap<String, Object> map = new HashMap<>();
        map.put("bootstrap.servers", "localhost:9092");
        map.put("key.deserializer", StringDeserializer.class);
        map.put("value.deserializer", StringDeserializer.class);
        map.put("group.id", "test");
        map.put("auto.offset.reset", "latest");
        map.put("enable.auto.commit", Boolean.FALSE);
        // Build the collection of topics to subscribe to
        Collection<String> topics = Arrays.asList("kafka-test");
        // Create a direct stream that consumes the Kafka topic
        JavaInputDStream<ConsumerRecord<String, String>> directStream = KafkaUtils.createDirectStream(
                ssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, map));
        // Split each Kafka message value into words
        JavaDStream<String> stream1 = directStream.flatMap(new FlatMapFunction<ConsumerRecord<String, String>, String>() {
            @Override
            public Iterator<String> call(ConsumerRecord<String, String> record) throws Exception {
                return Arrays.asList(record.value().split(" ")).iterator();
            }
        });
        // Map every word to a (word, 1) pair
        JavaPairDStream<String, Integer> stream2 = stream1.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });
        /**
         * To use the updateStateByKey operator, a checkpoint directory must be set so that the
         * checkpoint mechanism is enabled. That way the state of every key is not only held in
         * memory but also checkpointed: Spark Streaming requires checkpointing whenever key state
         * has to be kept long-term, so the data can be recovered from the checkpoint if it is
         * lost from memory.
         **/
        // Here a "datas" folder under the local project directory is used to record the data
        ssc.checkpoint("datas");
        // reduceByKey is not used here because it cannot accumulate results across batches;
        // updateStateByKey is used instead, and its precondition is the checkpoint directory above
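        // For comparison only (a sketch, not part of the original job): reduceByKey would produce
        // per-batch counts that reset every 5 seconds instead of a running total, e.g.
        // JavaPairDStream<String, Integer> perBatchCounts = stream2.reduceByKey((a, b) -> a + b);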
        // updateStateByKey keeps a running count per word across all batches
        JavaPairDStream<String, Integer> stream3 = stream2.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            @Override
            public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
                // Start from the previously accumulated count for this key, if any
                Integer newValue = 0;
                if (state.isPresent()) {
                    newValue = state.get();
                }
                // Add the counts that arrived in the current batch
                for (Integer value : values) {
                    newValue += value;
                }
                return Optional.of(newValue);
            }
        });
        // Print the accumulated counts for each batch, then start the streaming job
        stream3.print();
        ssc.start();
        ssc.awaitTermination();
    }
}
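To feed the job with test data, the kafka-test topic needs messages whose words are separated by spaces. Below is a minimal producer sketch, not part of the job above: the class name TestDataProducer is made up for illustration, and it assumes the broker runs at localhost:9092 and that kafka-clients is on the classpath (spark-streaming-kafka-0-10 already pulls it in transitively).

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import java.util.Properties;

public class TestDataProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // Each message is a space-separated line, matching the split(" ") in the streaming job
            producer.send(new ProducerRecord<>("kafka-test", "hello spark hello kafka"));
            producer.send(new ProducerRecord<>("kafka-test", "hello streaming"));
            producer.flush();
        }
    }
}

Because auto.offset.reset is set to latest and auto commit is disabled, only messages produced after the streaming job has started are counted; every 5-second batch then prints the accumulated (word, count) pairs.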