scala版本
package nj.zb.spark
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Spark Streaming word-count demo (Scala version): reads lines from a TCP
 * socket, splits them into words, and prints per-batch word counts.
 *
 * @author XiongJinbiao
 * @date 2020/12/18 17:05
 */
object SparkStreamDemo1 {

  /**
   * Entry point: builds a local Spark Streaming word count over a socket
   * text source and runs it until externally terminated.
   */
  def main(args: Array[String]): Unit = {
    // Run locally, using as many worker threads as there are cores.
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkStream")
    // Batch interval: one micro-batch is collected every 3 seconds.
    val ssc = new StreamingContext(conf, Seconds(3))
    // Source: plain-text lines received over TCP from host:port.
    val lines: ReceiverInputDStream[String] =
      ssc.socketTextStream("192.168.153.10", 7777)
    // Word count: split each line on whitespace, pair each word with 1, sum.
    val words: DStream[String] = lines.flatMap(_.split("\\s+"))
    val counts: DStream[(String, Int)] =
      words.map(word => (word, 1)).reduceByKey(_ + _)
    // Print each batch's counts to stdout.
    counts.print()
    // Start the receiver, then block the driver until termination.
    ssc.start()
    ssc.awaitTermination()
  }
}
java版本
package nj.zb.spark;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Spark Streaming word-count demo (Java version): reads lines from a TCP
 * socket, splits them into words, and prints per-batch word counts.
 *
 * @author XiongJinbiao
 * @date 2020/12/19 12:33
 */
public class JavaSparkStreamDemo1 {

    /**
     * Entry point: builds a local Spark Streaming word count over a socket
     * text source and runs it until externally terminated.
     *
     * <p>The Spark function interfaces ({@code FlatMapFunction},
     * {@code PairFunction}, {@code Function2}) are functional interfaces, so
     * the transformations are expressed as lambdas instead of anonymous
     * inner classes — identical behavior, less boilerplate.
     */
    public static void main(String[] args) {
        // Run locally, using as many worker threads as there are cores.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("wordcount");
        // Batch interval: one micro-batch is collected every 3 seconds.
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(3));

        // Source: plain-text lines received over TCP from host:port.
        // Each record in this DStream is one line of text.
        JavaReceiverInputDStream<String> lines = jsc.socketTextStream("192.168.153.10", 7777);

        // flatMap: split each line on whitespace, emitting one record per word.
        // A DStream is a template for RDDs; before computation runs, each
        // batch's DStream operations are translated into RDD operations.
        JavaDStream<String> words =
                lines.flatMap(line -> Arrays.asList(line.split("\\s+")).iterator());

        // Turn each word into the pair (word, 1).
        JavaPairDStream<String, Integer> pairs =
                words.mapToPair(word -> new Tuple2<>(word, 1));

        // Sum the counts per word within each batch.
        JavaPairDStream<String, Integer> wordcount =
                pairs.reduceByKey((left, right) -> left + right);

        // Print each batch's counts to stdout.
        wordcount.print();

        // Start the receiver, then block the driver until termination.
        jsc.start();
        try {
            jsc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
在命令行输入 nc -lk 7777 后，
输入内容，输出如下：
-------------------------------------------
Time: 1608361890000 ms
-------------------------------------------
(java,1)
(heelo,1)
-------------------------------------------
Time: 1608361893000 ms
-------------------------------------------
-------------------------------------------
Time: 1608361896000 ms
-------------------------------------------
(soa,1)
(hello,1)
-------------------------------------------
Time: 1608361899000 ms
-------------------------------------------
(a,1)
(nihao,1)
-------------------------------------------
Time: 1608361902000 ms
-------------------------------------------
-------------------------------------------
Time: 1608361905000 ms
-------------------------------------------
(i,1)
(love,1)
(you,1)
(babay,1)