在Spark Streaming中使用DataFrames和SQL操作。
必须使用StreamingContext正在使用的SparkContext来创建SparkSession。此外,创建方式还要保证在driver发生故障后能够重新启动。
这是通过创建一个延迟实例化的SparkSession单例实例来完成的。这在以下示例中显示。它修改了早期的单词计数示例,以使用DataFrames和SQL生成单词计数。每个RDD都转换为DataFrame,注册为临时表,然后使用SQL进行查询。
代码使用Java语言编写。
StreamingWordCountApp.java
package com.imooc.spark;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
/**
 * Spark Streaming application, written in Java, that counts words using DataFrames and SQL.
 *
 * <p>Text lines are read from a socket and split into words. For every micro-batch the words
 * are converted to a DataFrame, registered as the temporary view {@code words}, aggregated
 * with a SQL query, and the result is printed to the console.
 */
public class StreamingWordCountApp {

    /** Delimiter that separates the words on each input line. */
    private static final String WORD_DELIMITER = "\t";

    /**
     * Builds the streaming pipeline, starts it and blocks until it terminates.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the streaming context fails or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setMaster("local[2]")
                .setAppName("StreamingWordCountApp");
        // Micro-batches are cut every 5 seconds.
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Create a DStream from a socket source (hostname + port).
        JavaDStream<String> lines = jssc.socketTextStream("192.168.3.173", 9999);

        // Split every line into individual words; without this step each whole
        // line would be counted as a single "word".
        JavaDStream<String> words = lines.flatMap(
                line -> Arrays.asList(line.split(WORD_DELIMITER)).iterator());

        words.foreachRDD((rdd, time) -> {
            // Get (or lazily create) the SparkSession, configured from the SparkContext
            // that backs this RDD.
            SparkSession spark = SparkSession.builder()
                    .config(rdd.context().getConf())
                    .getOrCreate();

            // Convert RDD<String> to RDD<JavaRow> so it can be turned into a DataFrame.
            JavaRDD<JavaRow> rowRDD = rdd.map(word -> {
                JavaRow record = new JavaRow();
                record.setWord(word);
                return record;
            });
            Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRow.class);

            // Create a temporary view from the DataFrame so it can be queried with SQL.
            wordsDataFrame.createOrReplaceTempView("words");

            // Do the word count on the view using SQL and print it.
            Dataset<Row> wordCountsDataFrame =
                    spark.sql("select word, count(*) as total from words group by word");
            wordCountsDataFrame.show();
        });

        jssc.start();
        jssc.awaitTermination();
    }
}
JavaRow.java
package com.imooc.spark;
/**
 * Serializable Java bean that carries a single {@code word} value.
 *
 * <p>The bean exposes one read/write property, {@code word}, through a standard
 * getter/setter pair.
 */
public class JavaRow implements java.io.Serializable {

    /** Explicit serialization version so the serialized form does not depend on a compiler-derived hash. */
    private static final long serialVersionUID = 1L;

    private String word;

    /**
     * Returns the stored word.
     *
     * @return the word, or {@code null} if none has been set
     */
    public String getWord() {
        return word;
    }

    /**
     * Stores the given word.
     *
     * @param word the word to store
     */
    public void setWord(String word) {
        this.word = word;
    }
}
运行命令:
执行spark streaming程序结果显示如下:
官网: http://spark.apache.org/docs/2.3.0/streaming-programming-guide.html