package com.pyrrha.examples;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
public class UpdateStateByKey {

    private static final String KAFKA_TOPIC = "TopicA";

    public static void main(String[] args) throws Exception {
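        // winutils location so Hadoop code can run on Windows; adjust to your local install.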
        System.setProperty("hadoop.home.dir", "D:\\checkpoint\\hadoop-common-2.2.0-bin-master");
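        // local[2]: give the driver at least two threads so scheduling and processing can overlap.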
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("WordsCount");
        // Micro-batch interval: a new batch every 2 seconds.
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(2));
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("bootstrap.servers", "127.0.0.1:9092");
kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("group.id", "lingroup");
        // kafkaParams.put("auto.offset.reset", "largest"); // the 0.8 API expects "smallest"/"largest"
        Set<String> topics = new HashSet<String>();
        topics.add(KAFKA_TOPIC);
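        // Direct (receiver-less) stream: each Kafka partition maps 1:1 to a Spark partition.
        // Feed test input with, e.g.: kafka-console-producer.sh --broker-list 127.0.0.1:9092 --topic TopicA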
        JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
                jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
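        // Per-batch word count: split -> drop "a" -> (word, 1) -> sum, then a 6s tumbling window.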
        JavaPairDStream<String, Integer> transDStream = stream.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
            @Override
            public Iterator<String> call(Tuple2<String, String> t) throws Exception {
                // Split each message value into words.
                return Arrays.asList(t._2.split(" ")).iterator();
            }
        }).filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String v1) throws Exception {
                // Keep every word except "a".
                return !v1.equals("a");
            }
        }).mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }).window(Durations.seconds(6), Durations.seconds(6)); // 6s window, 6s slide: non-overlapping windows of three batches
        /**
         * The update function runs once per key:
         * v1 is the list of new values for that key from the current batch/window
         * (if the windowed counts contain ["b": 2, "c": 1], the call for key "b" gets v1 = [2]);
         * v2 is the state saved for that key by the previous updateStateByKey invocation.
         * So v1 carries the latest batch's values and v2 the running total cached so far.
         */
        transDStream.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            @Override
            public Optional<Integer> call(List<Integer> v1, Optional<Integer> v2) throws Exception {
                // Start from the previous state (0 if this key has no state yet) ...
                Integer v3 = 0;
                if (v2.isPresent())
                    v3 = v2.get();
                // ... and add all new counts from the current window.
                for (Integer v : v1)
                    v3 += v;
                return Optional.of(v3);
            }
        }).print();
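        // updateStateByKey needs a checkpoint directory to persist state between batches;
        // it must be set before jssc.start().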
        jssc.checkpoint("file:///D:/checkpoint/");

        jssc.start();
        jssc.awaitTermination();
    }
}