大数据学习之Flink——06Data Source

Flink 作为一款流式计算框架,既可用来做批处理,即处理静态的数据集、历史的数据集;也可以用来做流处理,即实时地处理数据流,并实时地产生计算结果。只要数据源源不断地到来,Flink 就能够一直计算下去。Data Source 就是这些数据的来源地。

一. 基于集合

1. fromCollection(Collection)

从 Collection 创建数据流。集合中的所有元素类型必须相同。

  1. scala代码

    package com.hjf.dataSource
    
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    
    /**
     * Word-count style demo: builds a DataStream from an in-memory
     * collection via fromCollection, then counts occurrences of each name.
     */
    object DataSourceForCollection {
    
      def main(args: Array[String]): Unit = {
        val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        // Single parallel task so the printed output arrives as one ordered stream.
        env.setParallelism(1)
        import org.apache.flink.streaming.api.scala._
        val names: DataStream[String] = env.fromCollection(Array(
          "张三", "李四", "张三", "王五"
        ))
        names
          .map(name => (name, 1))
          .keyBy(0)
          .sum(1)
          .print()
        env.execute()
      }
    }
    
  2. java代码

    package com.hjf.source;
    
    import org.apache.flink.api.common.functions.FlatMapFunction;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStreamSource;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.util.Collector;
    import java.util.Arrays;
    
    /**
     * Word-count demo fed from an in-memory java.util.List via fromCollection.
     */
    public class DataSourceForCollection {
    
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // Keep a single task so print() output is not interleaved across subtasks.
            env.setParallelism(1);
            DataStreamSource<String> source =
                    env.fromCollection(Arrays.asList("张三", "李四", "张三", "王五"));
            source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                    // Tokenize on single spaces and emit (token, 1) for each non-empty token.
                    for (String token : line.split(" ")) {
                        if (token.length() > 0) {
                            out.collect(new Tuple2<String, Integer>(token, 1));
                        }
                    }
                }
            }).keyBy(0).sum(1).print();
            env.execute("collect");
        }
    }
    
    
  3. 运行结果
    在这里插入图片描述

2. fromElements(T …)

从给定的对象序列中创建数据流。所有对象类型必须相同。

  1. scala代码

    package com.hjf.dataSource
    
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    
    /**
     * Word-count demo built from a fixed sequence of elements via fromElements.
     *
     * NOTE: renamed from DataSourceForCollection — the original name was a
     * copy-paste of the previous example and would clash with the object of the
     * same name in this package; the parallel Java example is DataSourceForElement.
     */
    object DataSourceForElement {
    
      def main(args: Array[String]): Unit = {
        val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        // Single parallel task so the printed output is in one stream.
        streamEnv.setParallelism(1)
        import org.apache.flink.streaming.api.scala._
        val stream: DataStream[String] = streamEnv.fromElements("hello", "word", "spark", "word")
        stream.map((_, 1)).keyBy(0).sum(1).print()
        streamEnv.execute()
      }
    }
    
  2. java代码

    package com.hjf.source;
    
    import org.apache.flink.api.common.functions.FlatMapFunction;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStreamSource;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.util.Collector;
    
    /**
     * Word-count demo built from a varargs sequence of elements via fromElements.
     */
    public class DataSourceForElement {
    
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // One subtask keeps the print() output ordered in a single stream.
            env.setParallelism(1);
            DataStreamSource<String> source = env.fromElements("张三", "李四", "张三", "王五");
            source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                    // Split each element on spaces; emit (token, 1) for non-empty tokens.
                    String[] tokens = line.split(" ");
                    for (String token : tokens) {
                        if (token.length() > 0) {
                            out.collect(new Tuple2<String, Integer>(token, 1));
                        }
                    }
                }
            }).keyBy(0).sum(1).print();
            env.execute("collect");
        }
    }
    
    
  3. 运行结果:
    在这里插入图片描述

二. 基于文件

1. 本地文件
  1. scala代码

    package com.hjf.dataSource
    
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    object DataSourceForFile {
         
      def main(args: Array[String]): Unit = {
         
        val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        streamEnv.setParallelism(1)
        import org.apache.flink.streaming.api.scala._
        val inputPath: String = getClass.getResource("/data.txt").getPath
        // 如果这里面直接填路径, 则需要填写绝对路径
        val stream: DataStream[String] = streamEnv.readTextFile(inputPath)
        stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1).print(
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值