updateStateByKey
updateStateByKey操作允许您在使用新的信息持续更新时保持任意状态
public class UpdateStateByKeyWordCount {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("UpdateStateByKeyWordCount");
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
        // 1. Enable checkpointing. updateStateByKey keeps long-lived per-key
        // state, so Spark Streaming requires a checkpoint directory: the state
        // can be recovered from it if the in-memory copy is lost.
        jsc.checkpoint("hdfs://spark1:9000/checkpoint");
        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");
        Set<String> topics = new HashSet<String>();
        topics.add("WordCount");
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
        lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
            private static final long serialVersionUID = 1L;
            // Split each Kafka message value into space-separated words.
            @Override
            public Iterator<String> call(Tuple2<String, String> t) throws Exception {
                return Arrays.asList(t._2.split(" ")).iterator();
            }
        }).mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            // Pair each word with an initial count of 1.
            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        }).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            private static final long serialVersionUID = 6739658684229224595L;
            /**
             * @param values the new counts for this key in the current batch
             *               (there may be several per batch)
             * @param state  the previously accumulated count for this key, if any
             * @return the updated running total for this key
             */
            @Override
            public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
                // Primitive accumulator avoids repeated Integer autoboxing
                // inside the loop.
                int newValue = state.isPresent() ? state.get() : 0;
                // Add every count that arrived in this batch.
                for (Integer value : values) {
                    newValue += value;
                }
                return Optional.of(newValue);
            }
        }).print();
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}
使用到updateStateByKey要开启checkpoint机制
transform
将DStream 转换为一个个底层的RDD ;从而实现将DStream中的RDD到其他类型RDD的任意操作
public class TransformBlacklist {
    public static void main(String[] args) throws InterruptedException {
        // Fixed: the app name was copy-pasted from another example as
        // "KafkaDirectWordCount".
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("TransformBlacklist");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        // Required by the updateStateByKey call at the end of the pipeline.
        jssc.checkpoint("hdfs://spark1:9000/checkpoint");
        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");
        Set<String> topics = new HashSet<String>();
        topics.add("WordCount");
        JavaPairInputDStream<String, String> adsClickLogDStream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
        // Simulated blacklist RDD: user -> blacklisted flag.
        List<Tuple2<String, Boolean>> blacklist = new ArrayList<Tuple2<String, Boolean>>();
        blacklist.add(new Tuple2<String, Boolean>("tome", true));
        JavaPairRDD<String, Boolean> blacklistRDD = jssc.sparkContext().parallelizePairs(blacklist);
        adsClickLogDStream.filter(new Function<Tuple2<String, String>, Boolean>() {
            private static final long serialVersionUID = -7235497847013109561L;
            // Keep only records with at least two space-separated fields,
            // because the second field is used as the join key below.
            @Override
            public Boolean call(Tuple2<String, String> v1) throws Exception {
                return v1._2.split(" ").length > 1;
            }
        }).mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {
            private static final long serialVersionUID = -3077988541495338394L;
            // Key each log line by its second field (presumably the user id —
            // verify against the producer's log format) so it can be joined
            // against the blacklist.
            @Override
            public Tuple2<String, String> call(Tuple2<String, String> t) throws Exception {
                return new Tuple2<String, String>(t._2.split(" ")[1], t._2);
            }
        }).transform(new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;
            // transform exposes each batch as a plain RDD, which allows joining
            // the stream against an ordinary (non-streaming) RDD.
            @Override
            public JavaRDD<String> call(JavaPairRDD<String, String> userAdsClickLogRDD) throws Exception {
                return userAdsClickLogRDD.leftOuterJoin(blacklistRDD).filter(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, Boolean>() {
                    private static final long serialVersionUID = 1989422109385197246L;
                    // Drop records whose key matched a blacklist entry set to true.
                    @Override
                    public Boolean call(Tuple2<String, Tuple2<String, Optional<Boolean>>> v1) throws Exception {
                        return !(v1._2._2.isPresent() && v1._2._2.get());
                    }
                }).map(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, String>() {
                    private static final long serialVersionUID = 2131045968786323938L;
                    // Recover the original log line from the join result.
                    @Override
                    public String call(Tuple2<String, Tuple2<String, Optional<Boolean>>> v1) throws Exception {
                        return v1._2._1();
                    }
                });
            }
        }).mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 3905589286163712170L;
            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        }).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            private static final long serialVersionUID = 1L;
            // Running count per key across batches (see checkpoint above).
            @Override
            public Optional<Integer> call(List<Integer> v1, Optional<Integer> state) throws Exception {
                // Primitive accumulator avoids repeated Integer autoboxing.
                int newValue = state.isPresent() ? state.get() : 0;
                for (Integer value : v1) {
                    newValue += value;
                }
                return Optional.of(newValue);
            }
        }).print();
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
}
window滑动窗口
Spark Streaming提供了滑动窗口操作的支持,从而让我们可以对一个滑动窗口内的数据执行计算操作。每次掉落在窗口内的RDD的数据,会被聚合起来执行计算操作,然后生成的RDD,会作为window DStream的一个RDD
如官网图中所示,就是对每三秒钟的数据执行一次滑动窗口计算,这3秒内的3个RDD会被聚合起来进行处理,然后过了两秒钟,又会对最近三秒内的数据执行滑动窗口计算。所以每个滑动窗口操作,都必须指定两个参数,窗口长度以及滑动间隔,而且这两个参数值都必须是batch间隔的整数倍。
Spark-Streaming对滑动窗口支持的转换操作:
转化 | 简述 |
---|---|
window(windowLength,slideInterval) | 返回一个新的DStream ,它是基于窗口的源Dstream的batches 集合 |
countByWindow(windowLength,slideInterval) | 返回数据流的滑动窗口中的元素的数量 |
reduceByWindow(func,windowLength,slideInterval) | 在一个滑动窗口内,使用函数func 聚合元素,产生一个新的单元素的流。这个func必须满足结合律且可交换,才能正确地并行处理数据。 |
reduceByKeyAndWindow(func,windowLength,slideInterval, [numTasks]) | 当kv 键值对的数据流,返回一个新的kv键值对的新数据流,新数据流每个key通过 给定的reduce 函数func 在一个窗口内进行值的聚合。需要注意的: 这个使用spark默认并行数量(local模式的话是2 ,cluster模式的话取决于 配置参数 spark.default.parallelism)进行分组。你可以传入一个可选的参数 numTasks 参数设置一个不同的task的数量 |
reduceByKeyAndWindow(func, invFunc, windowLength, slideInterval, [numTasks]) | 比上面的reduceByKeyAndWindow 更有效的一个版本,能够在之前window的reduce 值 加上当前窗口计算reduce的值 。这个实现是通过reducing 新进入到窗口的数据,反向reducing 离开窗口的老数据。举个例子,随着窗口的滑动,对key的统计值进行加减。这个只适用于可以逆转的函数,也就是说,这些reduce的函数,有一个相关的逆向的函数。注意: 这个操作必须设置 checkpointing。 |
countByValueAndWindow (windowLength, slideInterval, [numTasks]) | 当kv键值对的数据流被调用的时候,返回一个新的kv键值对的数据流。就像 reduceByKeyAndWindow ,reduce的task的数量是可以通过配置修改的。 |
/**
* 热点搜索词滑动统计,每隔10秒钟,统计最近60秒钟的搜索词的搜索频次,并打印出排名最靠前的3个搜索词以及出现次数
*
*/
public class WindowHotWord {
public static void main(String[] args) throws InterruptedException {
LogManager.getRootLogger().setLevel(Level.ERROR);
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("UpdateStateByKeyWordCount");
JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
// 1.开启checkpoint机制 长期保存一份key的state的话,那么spark
// streaming是要求必须用checkpoint的,以便于在内存数据丢失的时候,可以从checkpoint中恢复数据
jsc.checkpoint("hdfs://spark1:9000/checkpoint");
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");
Set<String> topics = new HashSet<String>();
topics.add("WordCount");
JavaPairInputDStream<String, String> searchLogsDStream = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
searchLogsDStream.mapToPair(new PairFunction<Tuple2<String, String>, String, Integer>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, Integer> call(Tuple2<String, String> searchLog) throws Exception {
return new Tuple2<String, Integer>(searchLog._2.split(" ")[1], 1);
}
}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
private static final long serialVersionUID = -974658511061140165L;
/**
* 每隔10秒钟,将最近60秒的数据,作为一个窗口,进行内部的RDD的聚合
*/
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
}, Durations.seconds(60) // 窗口长度,这里是60秒
, Durations.seconds(10) // 滑动间隔,这里是10秒
/**
* 执行transformToPair操作,因为,一个窗口,就是一个60秒钟的数据,会变成一个RDD
*/
).transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
private static final long serialVersionUID = 1L;
@Override
public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> searchWordCountsRDD) throws Exception {
List<Tuple2<String, Integer>> hogSearchWordCounts = searchWordCountsRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
return new Tuple2<Integer, String>(t._2, t._1);
}
}).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
return new Tuple2<String, Integer>(t._2, t._1);
}
}).take(3);
for (Tuple2<String, Integer> wordCount : hogSearchWordCounts) {
System.out.println(wordCount._1 + ": " + wordCount._2);
}
return searchWordCountsRDD;
}
}).print();
jsc.start();
jsc.awaitTermination();
jsc.close();
}
}
output操作
DStream中的所有计算,都是由output操作触发的,比如print()。如果没有任何output操作,那么,压根儿就不会执行定义的计算逻辑。
此外,即使你使用了foreachRDD output操作,也必须在里面对RDD执行action操作,才能触发对每一个batch的计算逻辑。否则,光有foreachRDD output操作,在里面没有对RDD执行action操作,也不会触发任何逻辑。
Output | Meaning |
---|---|
print() | 打印每个batch中的前10个元素,主要用于测试,或者是不需要执行什么output操作时,用于简单触发一下job。 |
saveAsTextFile(prefix, [suffix]) | 将每个batch的数据保存到文件中。每个batch的文件的命名格式为:prefix-TIME_IN_MS[.suffix] |
saveAsObjectFile | 同上,但是将每个batch的数据以序列化对象的方式,保存到SequenceFile中。 |
saveAsHadoopFile | 同上,将数据保存到Hadoop文件中。 |
foreachRDD | 最常用的output操作,遍历DStream中的每个产生的RDD,进行处理。可以将每个RDD中的数据写入外部存储,比如文件、数据库、缓存等。通常在其中,是针对RDD执行action操作的,比如foreach。 |
/**
 * A minimal static JDBC connection pool shared across partitions.
 * Both methods synchronize on the class monitor: the original version only
 * synchronized getConnection, so returnConnection could race against it on
 * the non-thread-safe LinkedList (and NPE'd if called before first use).
 */
public class ConnectionPool {
    // Pool of idle connections; guarded by the ConnectionPool class monitor.
    // Eagerly initialized so returnConnection never sees a null queue.
    static LinkedList<Connection> connectionQueue = new LinkedList<Connection>();
    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
    /**
     * Hands out an idle connection, topping the pool up with two fresh
     * connections whenever it is empty. The original only ever created two
     * connections at first use, so once both were checked out every later
     * caller got null forever.
     *
     * @return a connection, or null if none could be created
     */
    public synchronized static Connection getConnection() {
        if (connectionQueue.isEmpty()) {
            for (int i = 0; i < 2; i++) {
                try {
                    Connection conn = DriverManager.getConnection("jdbc:mysql://spark1:3306/test", "", "");
                    connectionQueue.push(conn);
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return connectionQueue.poll();
    }
    /**
     * Returns a connection to the pool. Null is ignored so a failed
     * getConnection round-trip cannot poison the queue.
     */
    public synchronized static void returnConnection(Connection conn) {
        if (conn != null) {
            connectionQueue.push(conn);
        }
    }
}
public class PersistWordCount {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("PersistWordCount");
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
        // updateStateByKey requires checkpointing so state survives failures.
        jsc.checkpoint("hdfs://spark1:9000/checkpoint");
        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");
        Set<String> topics = new HashSet<String>();
        topics.add("WordCount");
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
        lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
            private static final long serialVersionUID = 1L;
            // Split each Kafka message value into space-separated words.
            @Override
            public Iterator<String> call(Tuple2<String, String> t) throws Exception {
                return Arrays.asList(t._2.split(" ")).iterator();
            }
        }).mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        }).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            private static final long serialVersionUID = 6739658684229224595L;
            // Running word count per key across batches.
            @Override
            public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
                // Primitive accumulator avoids repeated Integer autoboxing.
                int newValue = state.isPresent() ? state.get() : 0;
                for (Integer value : values) {
                    newValue += value;
                }
                return Optional.of(newValue);
            }
        }).foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
            private static final long serialVersionUID = 1824829383618695238L;
            @Override
            public void call(JavaPairRDD<String, Integer> t) throws Exception {
                // One connection per partition, reused for every record in it.
                t.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
                    private static final long serialVersionUID = -7675359490340250314L;
                    @Override
                    public void call(Iterator<Tuple2<String, Integer>> wordCounts) throws Exception {
                        Connection conn = ConnectionPool.getConnection();
                        try {
                            // Parameterized SQL: the original concatenated the
                            // values into the statement text (SQL-injection
                            // prone) and created a new Statement per record
                            // without ever closing it.
                            PreparedStatement stmt = conn.prepareStatement("insert into wordcount(word,count) values (?,?)");
                            try {
                                while (wordCounts.hasNext()) {
                                    Tuple2<String, Integer> wordCount = wordCounts.next();
                                    stmt.setString(1, wordCount._1);
                                    stmt.setInt(2, wordCount._2);
                                    stmt.executeUpdate();
                                }
                            } finally {
                                stmt.close();
                            }
                        } finally {
                            // Always hand the connection back, even on failure.
                            ConnectionPool.returnConnection(conn);
                        }
                    }
                });
            }
        });
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}
与Spark SQL结合使用
/**
 * Spark SQL integration: real-time top-3 hot products per category, computed
 * over a 60-second window that slides every 10 seconds.
 */
public class Top3HotProduct {
    public static void main(String[] args) throws InterruptedException {
        // Fixed: the app name was copy-pasted from another example as
        // "UpdateStateByKeyWordCount".
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Top3HotProduct");
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
        jsc.checkpoint("hdfs://spark1:9000/checkpoint");
        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");
        Set<String> topics = new HashSet<String>();
        topics.add("WordCount");
        JavaPairInputDStream<String, String> productClickLogsDStream = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
        productClickLogsDStream.mapToPair(new PairFunction<Tuple2<String, String>, Tuple2<String, String>, Integer>() {
            private static final long serialVersionUID = 1L;
            // Key each click by (category, product). Assumes field 1 of the
            // space-separated log is the product and field 2 the category —
            // TODO confirm against the producer's log format.
            @Override
            public Tuple2<Tuple2<String, String>, Integer> call(Tuple2<String, String> t) throws Exception {
                String[] productClickLogSplited = t._2.split(" ");
                return new Tuple2<Tuple2<String, String>, Integer>(new Tuple2<String, String>(productClickLogSplited[2], productClickLogSplited[1]), 1);
            }
        }).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            // Sum click counts per (category, product) inside each window.
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }, Durations.seconds(60), Durations.seconds(10)).foreachRDD(new VoidFunction<JavaPairRDD<Tuple2<String, String>, Integer>>() {
            private static final long serialVersionUID = -3482150543623000714L;
            @Override
            public void call(JavaPairRDD<Tuple2<String, String>, Integer> t) throws Exception {
                // Turn ((category, product), count) pairs into SQL Rows.
                JavaRDD<Row> categoryProductCountRowRDD = t.map(new Function<Tuple2<Tuple2<String, String>, Integer>, Row>() {
                    private static final long serialVersionUID = 4064805100943824158L;
                    @Override
                    public Row call(Tuple2<Tuple2<String, String>, Integer> v1) throws Exception {
                        return RowFactory.create(v1._1._1, v1._1._2, v1._2);
                    }
                });
                List<StructField> structFields = new ArrayList<StructField>();
                structFields.add(DataTypes.createStructField("category", DataTypes.StringType, true));
                structFields.add(DataTypes.createStructField("product", DataTypes.StringType, true));
                structFields.add(DataTypes.createStructField("click_count", DataTypes.IntegerType, true));
                // getOrCreate reuses one SparkSession across batches.
                SparkSession spark = SparkSession.builder().config(categoryProductCountRowRDD.context().conf()).getOrCreate();
                spark.createDataFrame(categoryProductCountRowRDD, DataTypes.createStructType(structFields)).createOrReplaceTempView("product_click_log");
                // Rank products within each category by click count, keep top 3.
                spark.sql(
                        "SELECT category,product,click_count "
                        + "FROM ("
                        + "SELECT "
                        + "category,"
                        + "product,"
                        + "click_count,"
                        + "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank "
                        + "FROM product_click_log"
                        + ") tmp "
                        + "WHERE rank<=3").show();
            }
        });
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}