1. POM dependencies
<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>1.10.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.11</artifactId>
        <version>1.10.1</version>
        <!-- <scope>provided</scope> -->
    </dependency>
</dependencies>
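A note on the commented-out <scope>provided</scope>: marking the streaming dependency as provided keeps it out of your packaged fat jar when you later submit to a cluster that already ships the Flink runtime, but for running inside the IDE the dependency must stay on the compile classpath, which is why the scope is commented out here.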
2. Batch WordCount
package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * Batch WordCount
 */
public class WordCount {
    public static void main(String[] args) throws Exception {
        // Create the batch execution environment
        ExecutionEnvironment executionEnvironment = ExecutionEnvironment.getExecutionEnvironment();
        // Read data from a file
        String inputPath = "D:\\IDEA_WORK\\flinkstudy\\src\\main\\resources\\hello.txt";
        // Source: DataSource essentially extends DataSet
        DataSource<String> stringDataSource = executionEnvironment.readTextFile(inputPath);
        DataSet<String> inputDataSet = stringDataSource;
        // Transform the data set: process line by line, split on spaces, map each word to a (word, 1) tuple
        FlatMapOperator<String, Tuple2<String, Integer>> stringTuple2FlatMapOperator = inputDataSet.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                // Split the line on spaces
                String[] words = line.split(" ");
                for (String word : words) {
                    // Emit one (word, 1) tuple per word
                    out.collect(new Tuple2<String, Integer>(word, 1));
                }
            }
        });
        // groupBy(0): group by the word in the first tuple position
        UnsortedGrouping<Tuple2<String, Integer>> tuple2UnsortedGrouping = stringTuple2FlatMapOperator.groupBy(0);
        // sum(1): sum the counts in the second tuple position
        AggregateOperator<Tuple2<String, Integer>> sum = tuple2UnsortedGrouping.sum(1);
        // Print the result
        sum.print();
    }
}
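As a quick sanity check, suppose hello.txt contains the two lines below (sample data, any space-separated text works):

hello world
hello flink

print() then emits one final (word, count) tuple per distinct word, e.g. (hello,2), (world,1), (flink,1); the order of the tuples is not guaranteed.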
3. Streaming WordCount2 (reading a text file is still effectively batch)
package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * Streaming WordCount2
 */
public class WordCount2 {
    public static void main(String[] args) throws Exception {
        // Create the stream execution environment
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setParallelism(4); // Set the parallelism; the default is the number of CPU cores
        // Read data from a file
        String inputPath = "D:\\IDEA_WORK\\flinkstudy\\src\\main\\resources\\hello.txt";
        // Source: DataStreamSource is a streaming operator that essentially extends DataStream
        DataStreamSource<String> stringDataStreamSource = executionEnvironment.readTextFile(inputPath);
        DataStream<String> inputDataStream = stringDataStreamSource;
        // Transformations on the DataStream
        SingleOutputStreamOperator<Tuple2<String, Integer>> tuple2SingleOutputStreamOperator = inputDataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] words = line.split(" ");
                for (String word : words) {
                    Tuple2<String, Integer> stringIntegerTuple2 = new Tuple2<String, Integer>(word, 1);
                    out.collect(stringIntegerTuple2);
                }
            }
        });
        // keyBy(0): partition the stream by the word in the first tuple position
        KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = tuple2SingleOutputStreamOperator.keyBy(0);
        // sum(1): keep a running sum of the counts in the second tuple position
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
        sum.print();
        // Launch the job
        executionEnvironment.execute();
    }
}
The number at the far left of each output line is the index of the parallel subtask that produced it; here it ranges up to 4 because we explicitly set the parallelism to 4 (Flink's default is the number of CPU cores).
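A run over the same two-line hello.txt from section 2 might print something like the following (the subtask indices and interleaving vary between runs, but all updates for one word land on one subtask because keyBy routes a given key to a fixed subtask):

2> (hello,1)
4> (world,1)
2> (hello,2)
3> (flink,1)

Unlike the batch job, the stream emits a new running total every time a word arrives instead of a single final count.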
4. Using netcat to simulate a data producer for stream processing
On the Linux side: nc -lk 7777 (-l means listen, -k means keep the listener alive across connections)
You can then keep typing text on the Linux side and each line is sent to the job continuously.
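For example (each line is delivered to the job the moment you press Enter; the words are arbitrary sample input):

nc -lk 7777
hello world
hello flink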
package com.ucas.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author GONG
 * @version 1.0
 * @date 2020/12/21 16:25
 * Streaming WordCount3 (socket source)
 */
public class WordCount3 {
    public static void main(String[] args) throws Exception {
        // Create the stream execution environment
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // executionEnvironment.setParallelism(4); // Set the parallelism; the default is the number of CPU cores
        // ParameterTool extracts configuration options from the program arguments
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        String hostName = parameterTool.get("host");
        int port = parameterTool.getInt("port");
        // Data in Kafka is typically unbounded and streaming;
        // we use the little netcat tool to simulate that: nc -lk 7777 keeps pushing data on a port
        // Read data from a socket text stream
        DataStreamSource<String> stringDataStreamSource = executionEnvironment.socketTextStream(hostName, port);
        // Transformations on the DataStream
        SingleOutputStreamOperator<Tuple2<String, Integer>> tuple2SingleOutputStreamOperator = stringDataStreamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] words = line.split(" ");
                for (String word : words) {
                    Tuple2<String, Integer> stringIntegerTuple2 = new Tuple2<String, Integer>(word, 1);
                    out.collect(stringIntegerTuple2);
                }
            }
        });
        // Key by the word, then keep a running sum of the counts
        KeyedStream<Tuple2<String, Integer>, Tuple> tuple2TupleKeyedStream = tuple2SingleOutputStreamOperator.keyBy(0);
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = tuple2TupleKeyedStream.sum(1);
        sum.print();
        // Launch the job
        executionEnvironment.execute();
    }
}
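To try it end to end: start nc -lk 7777 on the server first, then launch the job with program arguments such as --host localhost --port 7777 (these values are just a local-test example). Typing hello world into the nc terminal prints (hello,1) and (world,1); typing hello again bumps the running count to (hello,2).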
We extracted host and port into program arguments so the same jar adapts to production environments: when submitting the job we can pass --host localhost --port 7777. IDEA can also simulate these arguments via Run/Debug Configurations > Program arguments.
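One related hardening tip: parameterTool.get("host") returns null and getInt("port") throws an exception when the argument is absent. ParameterTool also offers overloads that take a default value, so a minimal sketch for local testing could look like this (the localhost/7777 defaults are an assumption for local runs, not part of the original job):

// Fall back to local-test defaults when no program arguments are given
// (these default values are illustrative; adjust them for your environment).
String hostName = parameterTool.get("host", "localhost");
int port = parameterTool.getInt("port", 7777);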