Spark Streaming实现wordcount详解(Java和Scala篇)

scala版本

package nj.zb.spark

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming word count over a TCP text socket.
  *
  * Reads lines of text from a socket, splits every line on whitespace and
  * prints the number of occurrences of each word once per 3-second batch.
  * Counts are per batch only; nothing is accumulated across batches.
  *
  * Usage: `SparkStreamDemo1 [host] [port]`. Both arguments are optional and
  * default to `192.168.153.10` and `7777`. A matching test source can be
  * started on the source host with `nc -lk 7777`.
  *
  * @author XiongJinbiao
  * @date 2020/12/18 17:05
  */
object SparkStreamDemo1 {

  /** Host the socket source is read from when no argument is given. */
  private val DefaultHost = "192.168.153.10"

  /** Port the socket source is read from when no argument is given. */
  private val DefaultPort = 7777

  /**
    * Builds the streaming job, starts it and blocks until it terminates.
    *
    * @param args optional `host` (args(0)) and `port` (args(1)); a
    *             non-numeric port raises `NumberFormatException`
    */
  def main(args: Array[String]): Unit = {
    val host: String = if (args.length > 0) args(0) else DefaultHost
    val port: Int = if (args.length > 1) args(1).toInt else DefaultPort

    // local[*]: run in-process with one worker thread per core. The socket
    // receiver occupies one thread for its whole lifetime, so at least two
    // threads are required for any batch to actually be processed.
    val sparkStreamConf: SparkConf =
      new SparkConf().setMaster("local[*]").setAppName("sparkStream")

    // Batch interval: incoming data is grouped into one batch every 3 seconds.
    val streamingContext = new StreamingContext(sparkStreamConf, Seconds(3))

    // Input source: one record per line of text received on the socket.
    val socketLineStream: ReceiverInputDStream[String] =
      streamingContext.socketTextStream(host, port)

    // Tokenise. String.split keeps a leading empty string when the line is
    // blank or starts with whitespace, so drop empty tokens to avoid
    // reporting "" as a word.
    val wordStream: DStream[String] =
      socketLineStream.flatMap(line => line.split("\\s+").filter(_.nonEmpty))

    // (word, 1) pairs summed per word within each batch.
    val wordcount: DStream[(String, Int)] =
      wordStream.map(word => (word, 1)).reduceByKey(_ + _)

    // Output operation: print the first elements of every batch to stdout.
    wordcount.print()

    // Start the receiver and block the main thread until the job stops.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

java版本

package nj.zb.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Spark Streaming word count over a TCP text socket (Java API).
 *
 * <p>Reads lines of text from a socket, splits every line on whitespace and
 * prints the number of occurrences of each word once per 3-second batch.
 * Counts are per batch only; nothing is accumulated across batches.
 *
 * <p>Usage: {@code JavaSparkStreamDemo1 [host] [port]}. Both arguments are
 * optional and default to {@code 192.168.153.10} and {@code 7777}. A matching
 * test source can be started on the source host with {@code nc -lk 7777}.
 *
 * @author XiongJinbiao
 * @date 2020/12/19 12:33
 */
public class JavaSparkStreamDemo1 {

    /** Host the socket source is read from when no argument is given. */
    private static final String DEFAULT_HOST = "192.168.153.10";

    /** Port the socket source is read from when no argument is given. */
    private static final int DEFAULT_PORT = 7777;

    /**
     * Builds the streaming job, starts it and blocks until it terminates.
     *
     * @param args optional host ({@code args[0]}) and port ({@code args[1]});
     *             a non-numeric port raises {@link NumberFormatException}
     */
    public static void main(String[] args) {
        String host = args.length > 0 ? args[0] : DEFAULT_HOST;
        int port = args.length > 1 ? Integer.parseInt(args[1]) : DEFAULT_PORT;

        // local[*]: run in-process with one worker thread per core. The socket
        // receiver occupies one thread for its whole lifetime, so at least two
        // threads are required for any batch to actually be processed.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("wordcount");

        // Batch interval: incoming data is grouped into one batch every 3 seconds.
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(3));

        // Input source: one record per line of text received on the socket.
        JavaReceiverInputDStream<String> lines = jsc.socketTextStream(host, port);

        // Tokenise each line. flatMap turns one input record into zero or more
        // output records, so the resulting stream holds individual words.
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                // String.split keeps a leading empty string when the line is
                // blank or starts with whitespace; skip empty tokens so that
                // "" is never reported as a word.
                List<String> tokens = new ArrayList<>();
                for (String token : s.split("\\s+")) {
                    if (!token.isEmpty()) {
                        tokens.add(token);
                    }
                }
                return tokens.iterator();
            }
        });

        // Turn every word into a (word, 1) pair.
        JavaPairDStream<String, Integer> map = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });

        // Sum the ones per word within each batch.
        JavaPairDStream<String, Integer> wordcount = map.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });

        // Output operation: print the first elements of every batch to stdout.
        wordcount.print();

        // Start the receiver, then block the main thread until the job stops.
        jsc.start();
        try {
            jsc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so callers/shutdown logic can still
            // observe that this thread was interrupted.
            Thread.currentThread().interrupt();
        }
    }
}

在命令行执行 nc -lk 7777 后,
输入内容,程序输出如下:

-------------------------------------------
Time: 1608361890000 ms
-------------------------------------------
(java,1)
(heelo,1)

-------------------------------------------
Time: 1608361893000 ms
-------------------------------------------

-------------------------------------------
Time: 1608361896000 ms
-------------------------------------------
(soa,1)
(hello,1)

-------------------------------------------
Time: 1608361899000 ms
-------------------------------------------
(a,1)
(nihao,1)

-------------------------------------------
Time: 1608361902000 ms
-------------------------------------------

-------------------------------------------
Time: 1608361905000 ms
-------------------------------------------
(i,1)
(love,1)
(you,1)
(babay,1)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值