package org.apache.spark.examples.streaming;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
import com.google.common.collect.Lists;
public class SparkStreamingFromFlumeToHBaseExample {
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) {
if (args.length == 0) {
      System.err.println(
          "Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} "
              + "{table} {columnFamily} {windowInSeconds} {slideInSeconds}");
System.exit(1);
}
// String master = args[0];
// String host = args[1];
// int port = Integer.parseInt(args[2]);
String tableName = "test";// args[3];
String columnFamily = "f";// args[4];
// int windowInSeconds = 3;// Integer.parseInt(args[5]);
// int slideInSeconds = 1;// Integer.parseInt(args[5]);
String zkQuorum = "localhost";
String group = "test-consumer-group";
    String topicList = "test";
String numThread = "2";
Duration batchInterval = new Duration(5000);
// Duration windowInterval = new Duration(windowInSeconds * 1000);
// Duration slideInterval = new Duration(slideInSeconds * 1000);
    SparkConf sparkConf =
        new SparkConf().setAppName("SparkStreamingFromFlumeToHBaseExample");
    JavaStreamingContext jssc =
        new JavaStreamingContext(sparkConf, batchInterval);
    final Broadcast<String> broadcastTableName =
        jssc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily =
        jssc.sparkContext().broadcast(columnFamily);
// JavaDStream flumeStream = sc.flumeStream(host, port);
int numThreads = Integer.parseInt(numThread);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = topicList.split(",");
for (String topic : topics) {
topicMap.put(topic, numThreads);
}
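    // Receiver-based Kafka stream (spark-streaming-kafka, Spark 1.x API):
    // consumes each topic through the ZooKeeper quorum with numThreads
    // consumer threads; keys are Kafka message keys, values are the payloads.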
    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
    JavaDStream<String> lines =
        messages.map(new Function<Tuple2<String, String>, String>() {
          @Override
          public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
          }
        });
    // Split each message body into words. Note that 'lines' and 'words' are
    // not used further; the counting pipeline below re-derives them from
    // 'messages'.
    JavaDStream<String> words =
        lines.flatMap(new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
          }
        });
    // Per-batch word counts: extract the message bodies, split on spaces,
    // map each word to (word, 1), and sum the ones per word.
    JavaPairDStream<String, Integer> lastCounts =
        messages.map(new Function<Tuple2<String, String>, String>() {
          @Override
          public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
          }
        }).flatMap(new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
          }
        }).mapToPair(new PairFunction<String, String, Integer>() {
          @Override
          public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
          }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
          @Override
          public Integer call(Integer x, Integer y) throws Exception {
            return x + y;
          }
        });
    // Write each batch's counts to HBase as counter increments under the
    // "Counter" row, one column per word.
    lastCounts
        .foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
          @Override
          public Void call(JavaPairRDD<String, Integer> values, Time time)
              throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
              @Override
              public void call(Tuple2<String, Integer> tuple) throws Exception {
                HBaseCounterIncrementor incrementor =
                    HBaseCounterIncrementor.getInstance(
                        broadcastTableName.value(),
                        broadcastColumnFamily.value());
                incrementor.increment("Counter", tuple._1(), tuple._2());
                System.out.println("Counter:" + tuple._1() + "," + tuple._2());
              }
            });
            return null;
          }
        });
    jssc.start();
    jssc.awaitTermination();
}
}
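
/*
 * The listing above calls HBaseCounterIncrementor, a helper class that is not
 * shown. Below is a minimal sketch of what such a helper could look like,
 * assuming the HBase 0.98-era client API (HTable/Increment) that matches this
 * example's Spark 1.x vintage. The class name and the getInstance/increment
 * signatures are taken from the calling code; everything else is an
 * assumption, and a production version would likely batch increments and
 * manage connection cleanup.
 */
class HBaseCounterIncrementor {
  private static HBaseCounterIncrementor instance;

  private final org.apache.hadoop.hbase.client.HTable table;
  private final byte[] columnFamily;

  private HBaseCounterIncrementor(String tableName, String columnFamily)
      throws java.io.IOException {
    org.apache.hadoop.conf.Configuration conf =
        org.apache.hadoop.hbase.HBaseConfiguration.create();
    this.table = new org.apache.hadoop.hbase.client.HTable(conf, tableName);
    this.columnFamily =
        org.apache.hadoop.hbase.util.Bytes.toBytes(columnFamily);
  }

  // One shared instance per executor JVM, created lazily on first use.
  static synchronized HBaseCounterIncrementor getInstance(String tableName,
      String columnFamily) throws java.io.IOException {
    if (instance == null) {
      instance = new HBaseCounterIncrementor(tableName, columnFamily);
    }
    return instance;
  }

  // Atomically adds 'amount' to the cell at (row, columnFamily:qualifier).
  // HTable is not thread-safe, so access is serialized here.
  synchronized void increment(String row, String qualifier, long amount)
      throws java.io.IOException {
    org.apache.hadoop.hbase.client.Increment inc =
        new org.apache.hadoop.hbase.client.Increment(
            org.apache.hadoop.hbase.util.Bytes.toBytes(row));
    inc.addColumn(columnFamily,
        org.apache.hadoop.hbase.util.Bytes.toBytes(qualifier), amount);
    table.increment(inc);
  }
}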