4.1.2 Flink-流处理框架-wordCount的批处理和流处理

目录

1.pom依赖

2.批处理wordCount

3.流处理wordCount2(读取文本文件,本质上仍是批处理)

4.利用netcat模拟生产数据,流式处理


1.pom依赖

<dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.10.1</version>
            <!--            <scope>provided</scope>-->
        </dependency>
</dependencies>

2.批处理wordCount

package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * 批处理wordCount
 */
public class WordCount {
    public static void main(String[] args) throws Exception {
        //创建执行环境
        ExecutionEnvironment executionEnvironment = ExecutionEnvironment.getExecutionEnvironment();
        //从文件中读取数据
        String inputPath = "D:\\IDEA_WORK\\flinkstudy\\src\\main\\resources\\hello.txt";
        //数据源:本质是继承了dataSet
        DataSource<String> stringDataSource = executionEnvironment.readTextFile(inputPath);
        DataSet<String> inputDataSet = stringDataSource;

        //对数据集进行处理(按行处理,按照空格分词,转二元组)(word,1)
        FlatMapOperator<String, Tuple2<String, Integer>> stringTuple2FlatMapOperator = inputDataSet.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                //按照空格进行分词
                String[] words = line.split(" ");
                for (String word : words) {
                    // 输出数据
                    out.collect(new Tuple2<String, Integer>(word, 1));
                }
            }
        });

        //根据word分组groupBy,按照第一个位置的word进行分组
        UnsortedGrouping<Tuple2<String, Integer>> tuple2UnsortedGrouping = stringTuple2FlatMapOperator.groupBy(0);

        //按照第二个位置求和
        AggregateOperator<Tuple2<String, Integer>> sum = tuple2UnsortedGrouping.sum(1);

        //打印输出
        sum.print();
    }
}

3.流处理wordCount2(读取文本文件,本质上仍是批处理)

package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * 流处理wordCount2
 */
public class WordCount2 {
    public static void main(String[] args) throws Exception {
        //创建流处理执行环境
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setParallelism(4);//设置并行度,默认是电脑核心数

        //从文件中读取数据
        String inputPath = "D:\\IDEA_WORK\\flinkstudy\\src\\main\\resources\\hello.txt";
        //数据源:DataStreamSource流式算子,本质是继承了dataSet
        DataStreamSource<String> stringDataStreamSource = executionEnvironment.readTextFile(inputPath);
        DataStream<String> inputDataStream = stringDataStreamSource;

        //基于DataStream转换操作
        SingleOutputStreamOperator<Tuple2<String, Integer>> tuple2SingleOutputStreamOperator = inputDataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] words = line.split(" ");
                for (String word : words) {
                    Tuple2<String, Integer> stringIntegerTuple2 = new Tuple2<String, Integer>(word, 1);
                    out.collect(stringIntegerTuple2);
                }
            }
        });

        KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = tuple2SingleOutputStreamOperator.keyBy(0);
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
        sum.print();

        //执行任务
        executionEnvironment.execute();
    }
}

上面输出数据的最左侧代表并行度(子任务编号)。flink默认的并行度是电脑CPU核心数,这里在代码中显式设置为4。

4.利用netcat模拟生产数据,流式处理

linux端:nc -lk 7777    l代表listen监听,k表示keep保持连接

接着就可以在linux端持续发送文本数据

package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * 流处理wordCount2
 */
public class WordCount3 {
    public static void main(String[] args) throws Exception {
        //创建流处理执行环境
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // executionEnvironment.setParallelism(4);//设置并行度,默认是电脑核心数
        //parameterTool工具从程序启动参数中提取配置项
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        String hostName = parameterTool.get("host");
        int port = parameterTool.getInt("port");

        // kafka里面的数据往往是源源不断的,流式的
        // 我们使用netcat nc小工具模拟这个功能,可以在某个端口持续发东西 nc -lk 7777
        // 从socket文本流读取数据
        DataStreamSource<String> stringDataStreamSource = executionEnvironment.socketTextStream(hostName, port);

        //基于DataStream转换操作
        SingleOutputStreamOperator<Tuple2<String, Integer>> tuple2SingleOutputStreamOperator = stringDataStreamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] words = line.split(" ");
                for (String word : words) {
                    Tuple2<String, Integer> stringIntegerTuple2 = new Tuple2<String, Integer>(word, 1);
                    out.collect(stringIntegerTuple2);
                }
            }
        });

        KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = tuple2SingleOutputStreamOperator.keyBy(0);
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
        sum.print();

        //执行任务
        executionEnvironment.execute();
    }
}

我们在代码中将host、port参数提取了出来,目的是更好地应用到生产环境当中。在部署代码的时候,我们可以传入--host localhost --port 7777。IDEA中也可以模拟传入这些参数:

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值