Spark Streaming: DStream Transformations and Output Operations

updateStateByKey

The updateStateByKey operation lets you maintain arbitrary per-key state and continuously update it with new information arriving in each batch.

public class UpdateStateByKeyWordCount {

	public static void main(String[] args) throws InterruptedException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("UpdateStateByKeyWordCount");

		JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));

		// 1. Enable checkpointing. Keeping per-key state for a long time requires a
		// checkpoint directory, so that state can be recovered if in-memory data is lost.
		jsc.checkpoint("hdfs://spark1:9000/checkpoint");

		HashMap<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");

		Set<String> topics = new HashSet<String>();
		topics.add("WordCount");

		JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Iterator<String> call(Tuple2<String, String> t) throws Exception {
				return Arrays.asList(t._2.split(" ")).iterator();
			}
		}).mapToPair(new PairFunction<String, String, Integer>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Tuple2<String, Integer> call(String t) throws Exception {
				return new Tuple2<String, Integer>(t, 1);
			}
		}).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {

			private static final long serialVersionUID = 6739658684229224595L;

			/**
			 * values: the new values that arrived for this key in the current batch (there may be several).
			 * state: the previous state of this key.
			 */
			@Override
			public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
				Integer newValue = 0;
				// Check whether previous state exists for this key
				if (state.isPresent()) {
					newValue = state.get();
				}
				// Add all values that appeared in this batch to newValue
				for (Integer value : values) {
					newValue += value;
				}

				return Optional.of(newValue);
			}
		}).print();

		jsc.start();

		jsc.awaitTermination();

		jsc.close();

	}

}

Note that updateStateByKey requires the checkpoint mechanism to be enabled.
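If the project is on Java 8 or later, the same state-update function can be written more compactly as a lambda. A minimal sketch, assuming `pairs` is the JavaPairDStream<String, Integer> produced by mapToPair above and that this is Spark 2.x (org.apache.spark.api.java.Optional):

JavaPairDStream<String, Integer> wordCounts = pairs.updateStateByKey(
        (List<Integer> values, Optional<Integer> state) -> {
            // Start from the previous count for this key, if any.
            Integer newValue = state.isPresent() ? state.get() : 0;
            // Add every occurrence seen in the current batch.
            for (Integer value : values) {
                newValue += value;
            }
            return Optional.of(newValue);
        });
wordCounts.print();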

transform

transform exposes each underlying RDD of the DStream, so you can apply arbitrary RDD-to-RDD operations to every batch, including operations that are not available directly on the DStream API.
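As a minimal illustration (names assumed), any JavaRDD operation can be reached through transform, for example a per-batch distinct():

// `words` is assumed to be a JavaDStream<String>; distinct() is an RDD operation
// with no direct DStream equivalent, so transform is used to reach it per batch.
JavaDStream<String> deduped = words.transform(rdd -> rdd.distinct());

The full example below uses the same idea to join each batch against a blacklist RDD.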

public class TransformBlacklist {

	public static void main(String[] args) throws InterruptedException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("TransformBlacklist");

		JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

		jssc.checkpoint("hdfs://spark1:9000/checkpoint");

		HashMap<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");

		Set<String> topics = new HashSet<String>();
		topics.add("WordCount");

		JavaPairInputDStream<String, String> adsClickLogDStream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		// A mocked blacklist RDD
		List<Tuple2<String, Boolean>> blacklist = new ArrayList<Tuple2<String, Boolean>>();
		blacklist.add(new Tuple2<String, Boolean>("tome", true));

		JavaPairRDD<String, Boolean> blacklistRDD = jssc.sparkContext().parallelizePairs(blacklist);

		adsClickLogDStream.filter(new Function<Tuple2<String, String>, Boolean>() {

			private static final long serialVersionUID = -7235497847013109561L;

			@Override
			public Boolean call(Tuple2<String, String> v1) throws Exception {
				if (v1._2.split(" ").length > 1) {
					return true;
				}
				return false;
			}
		}).mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {

			private static final long serialVersionUID = -3077988541495338394L;

			@Override
			public Tuple2<String, String> call(Tuple2<String, String> t) throws Exception {
				return new Tuple2<String, String>(t._2.split(" ")[1], t._2);
			}
		}).transform(new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {

			private static final long serialVersionUID = 1L;

			@Override
			public JavaRDD<String> call(JavaPairRDD<String, String> userAdsClickLogRDD) throws Exception {
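				// Left-outer-join each batch against the blacklist RDD keyed by user name,
				// then keep only the log lines whose user is not flagged as blacklisted.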

				return userAdsClickLogRDD.leftOuterJoin(blacklistRDD).filter(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, Boolean>() {

					private static final long serialVersionUID = 1989422109385197246L;

					@Override
					public Boolean call(Tuple2<String, Tuple2<String, Optional<Boolean>>> v1) throws Exception {
						if (v1._2._2.isPresent() && v1._2._2.get()) {
							return false;
						}
						return true;
					}
				}).map(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, String>() {

					private static final long serialVersionUID = 2131045968786323938L;

					@Override
					public String call(Tuple2<String, Tuple2<String, Optional<Boolean>>> v1) throws Exception {
						return v1._2._1();
					}
				});
			}
		}).mapToPair(new PairFunction<String, String, Integer>() {

			private static final long serialVersionUID = 3905589286163712170L;

			@Override
			public Tuple2<String, Integer> call(String t) throws Exception {
				return new Tuple2<String, Integer>(t, 1);
			}

		}).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Optional<Integer> call(List<Integer> v1, Optional<Integer> state) throws Exception {

				Integer newValue = 0;

				if (state.isPresent()) {
					newValue = state.get();
				}

				for (Integer value : v1) {
					newValue += value;
				}

				return Optional.of(newValue);
			}
		}).print();

		jssc.start();
		jssc.awaitTermination();
		jssc.close();

	}
}

window (sliding windows)

Spark Streaming supports sliding-window operations, which let you run a computation over the data that falls inside a sliding window. The RDDs falling inside the window are aggregated and processed together, and the resulting RDD becomes one RDD of the windowed DStream.
[Figure: sliding-window diagram from the official Spark Streaming documentation]
As the official diagram shows, a windowed computation is run over every three seconds of data: the 3 RDDs inside the window are aggregated and processed together, and two seconds later the computation runs again over the most recent three seconds of data. Every sliding-window operation therefore takes two parameters, the window length and the slide interval, and both must be integer multiples of the batch interval.
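A minimal sketch (assuming `lines` is a JavaDStream<String> on a 5-second batch interval) that builds a 30-second window sliding every 10 seconds; both durations are multiples of the batch interval, as required:

// Each RDD of `windowedLines` contains the last 30 seconds of data,
// recomputed every 10 seconds.
JavaDStream<String> windowedLines = lines.window(Durations.seconds(30), Durations.seconds(10));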

Window-based transformations supported by Spark Streaming:

window(windowLength, slideInterval): returns a new DStream computed from windowed batches of the source DStream.
countByWindow(windowLength, slideInterval): returns the number of elements in a sliding window over the stream.
reduceByWindow(func, windowLength, slideInterval): returns a new single-element stream created by aggregating the elements in the window using func. The function must be associative and commutative so it can be computed correctly in parallel.
reduceByKeyAndWindow(func, windowLength, slideInterval, [numTasks]): when called on a DStream of (K, V) pairs, returns a new DStream of (K, V) pairs where the values of each key are aggregated with the given reduce function func over batches in a sliding window. Note: by default this uses Spark's default parallelism for the grouping (2 in local mode; spark.default.parallelism in cluster mode); you can pass the optional numTasks argument to use a different number of tasks.
reduceByKeyAndWindow(func, invFunc, windowLength, slideInterval, [numTasks]): a more efficient version of the above, in which each window's reduced value is computed incrementally from the previous window's value: new data entering the window is reduced in, and old data leaving the window is "inverse reduced" out (for example, adding and subtracting counts as the window slides; see the sketch after this list). It only applies to reduce functions that have a corresponding inverse function invFunc. Note: checkpointing must be enabled to use this operation.
countByValueAndWindow(windowLength, slideInterval, [numTasks]): when called on a DStream of (K, V) pairs, returns a new DStream where the value of each key is its frequency within the sliding window. As with reduceByKeyAndWindow, the number of reduce tasks is configurable through an optional argument.
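A minimal sketch of the inverse-function variant, assuming `pairs` is a JavaPairDStream<String, Integer> of (word, 1) tuples and that checkpointing has already been enabled on the context:

JavaPairDStream<String, Integer> windowedCounts = pairs.reduceByKeyAndWindow(
        (Integer a, Integer b) -> a + b,  // reduce counts entering the window
        (Integer a, Integer b) -> a - b,  // "inverse reduce" counts leaving the window
        Durations.seconds(60),            // window length
        Durations.seconds(10));           // slide interval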
/**
 * Sliding-window hot-search-term statistics: every 10 seconds, count how often each
 * search term appeared during the last 60 seconds and print the top 3 terms with their counts.
 */
public class WindowHotWord {

	public static void main(String[] args) throws InterruptedException {

		LogManager.getRootLogger().setLevel(Level.ERROR);

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("WindowHotWord");

		JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));

		// 1. Enable checkpointing. Keeping per-key state for a long time requires a
		// checkpoint directory, so that state can be recovered if in-memory data is lost.
		jsc.checkpoint("hdfs://spark1:9000/checkpoint");

		HashMap<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");

		Set<String> topics = new HashSet<String>();
		topics.add("WordCount");

		JavaPairInputDStream<String, String> searchLogsDStream = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		searchLogsDStream.mapToPair(new PairFunction<Tuple2<String, String>, String, Integer>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Tuple2<String, Integer> call(Tuple2<String, String> searchLog) throws Exception {
				return new Tuple2<String, Integer>(searchLog._2.split(" ")[1], 1);
			}
		}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

			private static final long serialVersionUID = -974658511061140165L;

			/**
			 * Every 10 seconds, take the most recent 60 seconds of data as one window
			 * and aggregate the RDDs inside that window.
			 */
			@Override
			public Integer call(Integer v1, Integer v2) throws Exception {
				return v1 + v2;
			}

		}, Durations.seconds(60) // window length: 60 seconds
				, Durations.seconds(10) // slide interval: 10 seconds
		/**
		 * Then apply transformToPair, because each window (60 seconds of data) becomes a single RDD.
		 */
		).transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {

			private static final long serialVersionUID = 1L;

			@Override
			public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> searchWordCountsRDD) throws Exception {
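				// Swap to (count, word) so sortByKey(false) sorts by count descending,
				// swap back to (word, count), and take the top 3 search terms.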

				List<Tuple2<String, Integer>> hotSearchWordCounts = searchWordCountsRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
						return new Tuple2<Integer, String>(t._2, t._1);
					}
				}).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
						return new Tuple2<String, Integer>(t._2, t._1);
					}
				}).take(3);

				for (Tuple2<String, Integer> wordCount : hotSearchWordCounts) {
					System.out.println(wordCount._1 + ": " + wordCount._2);
				}

				return searchWordCountsRDD;
			}
		}).print();

		jsc.start();

		jsc.awaitTermination();

		jsc.close();
	}

}

Output operations

All computation on a DStream is triggered by its output operations, such as print(). Without at least one output operation, the processing logic you define will never run.

Moreover, even when you use the foreachRDD output operation, you must invoke an action on the RDD inside it to trigger the computation for each batch. A foreachRDD whose body never runs an action on the RDD will not trigger any processing either.
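A minimal sketch of that point (assuming `wordCounts` is a JavaPairDStream produced earlier): without the count() action inside foreachRDD, nothing would actually be computed for the batch.

wordCounts.foreachRDD(rdd -> {
    // count() is an action; it is what actually triggers the computation for this batch.
    long records = rdd.count();
    System.out.println("records in this batch: " + records);
});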

Output operations and their meaning:

print(): prints the first 10 elements of every batch of the DStream. Mainly used for development and testing, or simply to trigger a job when no other output is needed.
saveAsTextFiles(prefix, [suffix]): saves each batch's data as text files. The file name of each batch is generated as prefix-TIME_IN_MS[.suffix].
saveAsObjectFiles(prefix, [suffix]): same as above, but saves each batch's data as SequenceFiles of serialized Java objects.
saveAsHadoopFiles(prefix, [suffix]): same as above, but saves the data as Hadoop files.
foreachRDD(func): the most general output operation; it applies func to every RDD produced by the DStream. Typically used to write each RDD's data to an external system such as a file, a database, or a cache, usually by running an action (for example foreach) on the RDD.
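The two classes below put this into practice: PersistWordCount writes the running word counts to MySQL inside foreachRDD, using foreachPartition so that one JDBC connection, taken from the simple static ConnectionPool, is reused for all records of a partition instead of being opened per record.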
public class ConnectionPool {

	static LinkedList<Connection> connectionQueue;

	static {
		try {
			Class.forName("com.mysql.jdbc.Driver");
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		}
	}
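	// Lazily create a small pool of two connections on first use; callers borrow with
	// getConnection() and must give the connection back via returnConnection().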

	public synchronized static Connection getConnection() {
		if (connectionQueue == null) {
			connectionQueue = new LinkedList<Connection>();
			for (int i = 0; i < 2; i++) {
				try {
					Connection conn = DriverManager.getConnection("jdbc:mysql://spark1:3306/test", "", "");
					connectionQueue.push(conn);
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
		return connectionQueue.poll();
	}

	public static void returnConnection(Connection conn) {
		connectionQueue.push(conn);
	}
}
public class PersistWordCount {

	public static void main(String[] args) throws InterruptedException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("PersistWordCount");

		JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));

		jsc.checkpoint("hdfs://spark1:9000/checkpoint");

		HashMap<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");

		Set<String> topics = new HashSet<String>();
		topics.add("WordCount");

		JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Iterator<String> call(Tuple2<String, String> t) throws Exception {
				return Arrays.asList(t._2.split(" ")).iterator();
			}
		}).mapToPair(new PairFunction<String, String, Integer>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Tuple2<String, Integer> call(String t) throws Exception {
				return new Tuple2<String, Integer>(t, 1);
			}
		}).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {

			private static final long serialVersionUID = 6739658684229224595L;

			@Override
			public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
				Integer newValue = 0;
				if (state.isPresent()) {
					newValue = state.get();
				}
				for (Integer value : values) {
					newValue += value;
				}

				return Optional.of(newValue);
			}
		}).foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {

			private static final long serialVersionUID = 1824829383618695238L;

			@Override
			public void call(JavaPairRDD<String, Integer> t) throws Exception {
				t.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {

					private static final long serialVersionUID = -7675359490340250314L;

					@Override
					public void call(Iterator<Tuple2<String, Integer>> wordCounts) throws Exception {
						Connection conn = ConnectionPool.getConnection();
						// Reuse a single Statement for the whole partition and close it when done,
						// instead of creating a new (and never closed) Statement per record.
						Statement stmt = conn.createStatement();
						Tuple2<String, Integer> wordCount;
						while (wordCounts.hasNext()) {
							wordCount = wordCounts.next();
							stmt.executeUpdate("insert into wordcount(word,count) values ('" + wordCount._1 + "'," + wordCount._2 + ")");
						}
						stmt.close();
						ConnectionPool.returnConnection(conn);
					}
				});
			}
		});

		jsc.start();
		jsc.awaitTermination();
		jsc.close();
	}
}

Integration with Spark SQL

/**
 * Integration with Spark SQL: real-time top-3 hot products per category.
 */
public class Top3HotProduct {

	public static void main(String[] args) throws InterruptedException {
		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Top3HotProduct");

		JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));

		jsc.checkpoint("hdfs://spark1:9000/checkpoint");

		HashMap<String, String> kafkaParams = new HashMap<String, String>();
		kafkaParams.put("metadata.broker.list", "spark1:9092,spark2:9092,spark3:9092");

		Set<String> topics = new HashSet<String>();
		topics.add("WordCount");

		JavaPairInputDStream<String, String> productClickLogsDStream = KafkaUtils.createDirectStream(jsc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

		productClickLogsDStream.mapToPair(new PairFunction<Tuple2<String, String>, Tuple2<String, String>, Integer>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Tuple2<Tuple2<String, String>, Integer> call(Tuple2<String, String> t) throws Exception {
				String[] productClickLogSplited = t._2.split(" ");
				return new Tuple2<Tuple2<String, String>, Integer>(new Tuple2<String, String>(productClickLogSplited[2], productClickLogSplited[1]), 1);
			}
		}).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Integer call(Integer v1, Integer v2) throws Exception {
				return v1 + v2;
			}
		}, Durations.seconds(60), Durations.seconds(10)).foreachRDD(new VoidFunction<JavaPairRDD<Tuple2<String, String>, Integer>>() {

			private static final long serialVersionUID = -3482150543623000714L;

			@Override
			public void call(JavaPairRDD<Tuple2<String, String>, Integer> t) throws Exception {
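				// Convert each ((category, product), clickCount) tuple into a Row
				// matching the schema defined below.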

				JavaRDD<Row> categoryProductCountRowRDD = t.map(new Function<Tuple2<Tuple2<String, String>, Integer>, Row>() {

					private static final long serialVersionUID = 4064805100943824158L;

					@Override
					public Row call(Tuple2<Tuple2<String, String>, Integer> v1) throws Exception {
						return RowFactory.create(v1._1._1, v1._1._2, v1._2);
					}
				});

				List<StructField> structFields = new ArrayList<StructField>();
				structFields.add(DataTypes.createStructField("category", DataTypes.StringType, true));
				structFields.add(DataTypes.createStructField("product", DataTypes.StringType, true));
				structFields.add(DataTypes.createStructField("click_count", DataTypes.IntegerType, true));

				SparkSession spark = SparkSession.builder().config(categoryProductCountRowRDD.context().conf()).getOrCreate();

				spark.createDataFrame(categoryProductCountRowRDD, DataTypes.createStructType(structFields)).createOrReplaceTempView("product_click_log");
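				// Use a row_number() window function to rank products within each category
				// by click count and keep the top 3 per category.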

				spark.sql(
						"SELECT category,product,click_count "
						+ "FROM ("
							+ "SELECT "
								+ "category,"
								+ "product,"
								+ "click_count,"
								+ "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank "
							+ "FROM product_click_log"  
						+ ") tmp "
						+ "WHERE rank<=3").show();
			}
		});

		jsc.start();
		jsc.awaitTermination();
		jsc.close();
	}
}