I. Non-SQL Implementation
1. Scala implementation of a Spark Streaming program that counts words every 2 seconds
1) Scala code:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Spark Streaming program: count the words received in each 2-second batch
 */
object SparkStreamingWordCountScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("streamWordCount")
    // local[*]: use all available cores (the socket receiver occupies one of them)
    conf.setMaster("local[*]")
    // streaming context with a 2-second batch interval
    val ssc = new StreamingContext(conf, Seconds(2))
    // socket text stream reading from host "master", port 8888
    val lines = ssc.socketTextStream("master", 8888)
    // split each line into words
    val words = lines.flatMap(_.split(" "))
    // pair each word with a count of 1
    val pair = words.map((_, 1))
    // sum the counts per word within the batch
    val result = pair.reduceByKey(_ + _)
    result.print()
    // start the context and block until it is stopped
    ssc.start()
    ssc.awaitTermination()
  }
}
2) Run the program, start nc on CentOS, and type test data into nc as shown below.
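For example, a listener can be started with netcat and fed a few words. This is a sketch: the exact nc flags depend on the netcat variant installed, the sample input is made up, and nc must run on the host the program connects to ("master" in the code above):

$ nc -lk 8888
hello world
hello spark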
The console then displays the per-batch results.
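Assuming the two sample lines above arrive within the same 2-second batch, print() renders each batch roughly like this (the timestamp is illustrative):

-------------------------------------------
Time: 1563500000000 ms
-------------------------------------------
(hello,2)
(world,1)
(spark,1)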
2. Java implementation of a Spark Streaming program that counts words every 2 seconds
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Java implementation: count the words received in each 2-second batch
 */
public class SparkStreamingWordCountJava {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf();
        conf.setAppName("streaming");
        conf.setMaster("local[*]");
        // Java streaming context with a 2-second batch interval
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(2));
        // socket text stream reading from host "master", port 8888
        JavaDStream<String> lines = ssc.socketTextStream("master", 8888);
        // flatten each line into a stream of words
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" ")).iterator();
            }
        });
        // pair each word with a count of 1
        JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        // sum the counts per word within the batch
        JavaPairDStream<String, Integer> result = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        result.print();
        ssc.start();
        ssc.awaitTermination();
    }
}
Testing is the same as for the Scala version.
II. SQL Implementation
1. Scala SQL implementation
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming integrated with Spark SQL to implement wordcount
 */
object SparkStreamingSQLWordCountScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("streamingWordCount")
    conf.setMaster("local[*]")
    // create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()
    // streaming context with a 2-second batch interval, reusing the session's SparkContext
    val ssc = new StreamingContext(spark.sparkContext, Seconds(2))
    // create the socket text stream
    val lines = ssc.socketTextStream("localhost", 8888)
    // flatten into a word stream
    val words = lines.flatMap(_.split(" "))
    words.foreachRDD(rdd => {
      import spark.implicits._
      // convert the batch RDD into a single-column DataFrame
      val df = rdd.toDF("word")
      // register a temporary view and run the aggregation in SQL
      df.createOrReplaceTempView("_doc")
      spark.sql("select word , count(*) from _doc group by word").show(1000, false)
    })
    // start the context
    ssc.start()
    // wait for termination
    ssc.awaitTermination()
  }
}
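With the same sample input as before, show() renders the per-batch aggregate as a DataFrame table, roughly like this (illustrative output; Spark SQL displays the count(*) column as count(1)):

+-----+--------+
| word|count(1)|
+-----+--------+
|hello|       2|
|world|       1|
|spark|       1|
+-----+--------+

Note that foreachRDD is the bridge between the DStream API and Spark SQL: each 2-second batch becomes an independent DataFrame, so the counts reset on every batch.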
2. Java SQL implementation
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import java.util.Arrays;
import java.util.Iterator;

/**
 * Java implementation: Spark Streaming integrated with Spark SQL for wordcount
 */
public class SparkStreamingSQLWordCountJava {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf();
        conf.setAppName("streaming");
        // local[2]: at least two threads, one for the socket receiver and one for processing
        conf.setMaster("local[2]");
        // create the SparkSession
        final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        // Java streaming context with a 2-second batch interval, reusing the session's SparkContext
        JavaStreamingContext ssc = new JavaStreamingContext(new JavaSparkContext(spark.sparkContext()), Durations.seconds(2));
        // create the socket text stream
        JavaDStream<String> lines = ssc.socketTextStream("localhost", 8888);
        // flatten each line into a stream of words
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" ")).iterator();
            }
        });
        words.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            public void call(JavaRDD<String> rdd) throws Exception {
                // wrap each word in a Row
                JavaRDD<Row> rdd2 = rdd.map(new Function<String, Row>() {
                    public Row call(String v1) throws Exception {
                        return RowFactory.create(v1);
                    }
                });
                // convert the RDD into a DataFrame with a single "word" column
                StructField[] fields = new StructField[1];
                fields[0] = new StructField("word", DataTypes.StringType, true, Metadata.empty());
                StructType type = new StructType(fields);
                Dataset<Row> df = spark.createDataFrame(rdd2, type);
                // register a temporary view and run the aggregation in SQL
                df.createOrReplaceTempView("_doc");
                spark.sql("select word , count(*) from _doc group by word").show(100, false);
            }
        });
        ssc.start();
        ssc.awaitTermination();
    }
}
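Testing is again the same: start nc on port 8888 and type a few words; each 2-second batch is aggregated and displayed as a DataFrame table like the one shown for the Scala SQL version.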