flink
1 基础上手
1.1 构建maven工程
1.1.1 引入依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build file for the Flink word-count examples in this document. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>flink</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Flink batch (DataSet) API, used by WordCount. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.10.1</version>
</dependency>
<!-- Flink streaming (DataStream) API, used by StreamWordCount.
     The "_2.12" suffix is the Scala binary version this artifact is built against. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.10.1</version>
</dependency>
<!-- Lombok annotation processor.
     NOTE(review): not referenced by the example classes below — confirm it is actually needed. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.16</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
1.2 批处理 wordcount
package com.zyz.wc;
import com.oracle.webservices.internal.api.databinding.DatabindingMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
//批处理
public class WordCount {
public static void main(String[] args) throws Exception {
//创建执行环境
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
//从文件中读取数据
String inputPath = "E:\\java.sql.jdk\\IDEA\\IdeaProjects\\Kingdom\\flink\\src\\main\\resources\\hello.txt";
DataSet<String> inputDataSet = env.readTextFile(inputPath);
//空格分词打散之后,对单词进行groupby分组, 然后用sum进行聚合
DataSet<Tuple2<String,Integer>> wordCountDataSet =
inputDataSet.flatMap(new MyFlatMapper())
.groupBy(0) // 传一个位置 0相当于第一个位置
.sum(1);//将第二个位置上的数据求和
//打印输出
wordCountDataSet.print();
}
//对数据进行处理,按空格分词展开,转换成(word,1) 二元组进行统计
// 传入string类型 得到二元组
public static class MyFlatMapper implements FlatMapFunction<String, Tuple2<String,
Integer>> {
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws
Exception {
String[] words = value.split(" ");
for (String word : words) {
//遍历所有word,包成二元组输出
out.collect(new Tuple2<String, Integer>(word, 1));
}
}
}
}
1.3 流处理
package com.zyz.wc;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
/**
 * Streaming word count: consumes lines from a TCP socket and continuously
 * prints a running count per word.
 *
 * <p>Host and port are supplied via program arguments: {@code --host h --port p}.
 */
public class StreamWordCount {
    public static void main(String[] args) throws Exception {
        // Streaming execution environment, pinned to a single parallel task.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Extract the connection settings from the launch arguments.
        ParameterTool params = ParameterTool.fromArgs(args);
        String host = params.get("host");
        int port = params.getInt("port");

        // Source: a plain-text socket stream (e.g. fed by `nc -lk <port>`).
        DataStream<String> lines = env.socketTextStream(host, port);

        // Reuse the batch job's splitter, key by the word (tuple field 0)
        // and keep a running sum of the counts (tuple field 1).
        DataStream<Tuple2<String, Integer>> counts = lines
                .flatMap(new WordCount.MyFlatMapper())
                .keyBy(0)
                .sum(1);

        // Emit every updated count to stdout.
        counts.print();

        // Submit and run the streaming job.
        env.execute();
    }
}
执行前的准备工作
-
使用一台linux服务器输入以下命令:
nc -lk 7777
- idea run设置
传递参数:
- 启动，然后在linux服务器端输入，控制台显示:
1.4 在集群上运行:
1.4.1 打包上传
小tip,并行度设置的优先级别:
-
代码后:
-
环境变量后设置:
-
web-ui页面设置:
1.4.2 提交查看
提交完成后:
提交成功:
由上图可知，我们本来有3个slot，而我们的任务只设置了一个并行度，所以只消耗一个slot
输出内容:
1.4.3 命令行提交
[root@node1 flink]# mkdir jar
[root@node1 flink]# ls
bin conf examples jar lib LICENSE licenses log NOTICE opt plugins README.txt tmp
[root@node1 flink]# bin/flink run -c com.zyz.wc.StreamWordCount -p 3 ./jar/flink-1.0-SNAPSHOT.jar --host 10.202.80.109 --port 7777
Job has been submitted with JobID 3893d1811f995f39e523b0ad8caa0bb0