package cn.spark.study.streaming;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import scala.Tuple2;
/**
 * Real-time word count program based on the Kafka direct-stream approach.
 *
 * <p>Runs with a {@code local[2]} master, consumes the {@code WordCount} topic in
 * 5-second batches, splits every message value on single spaces, and prints the
 * per-batch counts through {@code print()}.
 *
 * @author Administrator
 */
public class KafkaDurectWordCount {

    /**
     * Builds the streaming pipeline, starts it, and blocks until it terminates.
     *
     * @param args command-line arguments (not read)
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for the streaming context to terminate
     */
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("KafkaDirectWordCount");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        try {
            // First, build the map of Kafka parameters (the broker list).
            Map<String, String> kafkaParams = new HashMap<String, String>();
            kafkaParams.put("metadata.broker.list",
                    "192.168.1.107:9092,192.168.1.108:9092,192.168.1.109:9092");

            // Then build the set of topics to read. The original author notes that the
            // direct approach can read several topics in parallel on its own.
            Set<String> topics = new HashSet<String>();
            topics.add("WordCount");

            // Create the input DStream of (key, value) message pairs.
            // NOTE(review): StringDecoder has no import in the visible import block, and
            // this decoder-class overload looks like the older (0-8) Kafka integration
            // rather than the imported kafka010.KafkaUtils -- confirm which artifact the
            // build actually uses and align the imports accordingly.
            JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
                    jssc,
                    String.class,
                    String.class,
                    StringDecoder.class,
                    StringDecoder.class,
                    kafkaParams,
                    topics);

            // Word count: split each message value into words, pair every word with 1,
            // then sum the ones per word within each batch.
            JavaDStream<String> words =
                    lines.flatMap(x -> Arrays.asList(x._2.split(" ")).iterator());
            // Diamond instead of a raw Tuple2 so the pair type is checked by the compiler.
            JavaPairDStream<String, Integer> pairs = words.mapToPair(x -> new Tuple2<>(x, 1));
            JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey((i1, i2) -> i1 + i2);

            wordCounts.print();

            jssc.start();
            jssc.awaitTermination();
        } finally {
            // Release the streaming context even when start-up or the wait fails.
            jssc.close();
        }
    }
}
// Real-time word count program based on the Kafka Direct approach
// (scraped page footer) Latest recommended article published on 2021-05-07 09:00:26