1.简单DataSet API 案例
1.pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>flink-tutorial</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>flink-tutorial</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>1.8</java.version>
<flink-version>1.13.0</flink-version>
<scala.binary.version>2.12</scala.binary.version>
<slf4j.version>1.7.30</slf4j.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink-version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink-version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink-version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
<version>2.14.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
2.resources/log4j.properties
log4j.rootLogger = error,stdout
### 输出信息到控制台 ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
3.words.txt
hello world
hello flink
hello java
4.BatchWordCount.java
package com.example.flinktutorial.wordcount;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
* 这里采用DataSet api 进行了批处理 未来不用这种api
*
* Flink本身就是流批统一的处理框架,批处理本质也是流,所以从Flink1.12开始,采用DataStream API 进行流批统一处理
*
* bin/flink run -Dexecution.runtime-mode=BATCH BatchWordCount.jar
* 提交任务时,指定批处理模式
*/
public class BatchWordCount {
    /**
     * Word count implemented with the legacy DataSet (batch) API.
     *
     * <p>Reads "input/words.txt", splits each line on single spaces, and prints
     * (word, count) tuples. Since Flink 1.12 the DataStream API is the preferred,
     * unified batch/stream API; this class exists for comparison only.
     *
     * @param args unused
     * @throws Exception if reading the input file or executing the job fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // 2. Read the input file line by line.
        DataSource<String> lineDataSource = env.readTextFile("input/words.txt");

        // 3. Split each line into words and emit (word, 1) tuples.
        //    returns(...) is required: type erasure hides the Tuple2 generics from the lambda.
        FlatMapOperator<String, Tuple2<String, Long>> wordAndOneTuple = lineDataSource
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    for (String word : line.split(" ")) {
                        // 1L (uppercase suffix) — lowercase 'l' is easily misread as the digit '1'.
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // 4. Group by the word (tuple field 0).
        UnsortedGrouping<Tuple2<String, Long>> wordAndOneGroup = wordAndOneTuple.groupBy(0);

        // 5. Sum the counts (tuple field 1) within each group.
        AggregateOperator<Tuple2<String, Long>> sum = wordAndOneGroup.sum(1);

        // 6. Print the result to stdout (triggers execution for DataSet print()).
        sum.print();
    }
}
输出
(flink,1)
(world,1)
(hello,3)
(java,1)
这里采用DataSet api 进行了批处理 未来不用这种api
Flink本身就是流批统一的处理框架,批处理本质也是流,所以从Flink1.12开始,采用DataStream API 进行流批统一处理
bin/flink run -Dexecution.runtime-mode=BATCH BatchWordCount.jar
提交任务时,指定批处理模式
2、 DataStream API案例—有界流式处理
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* 有界的流式处理
*/
public class BoundedStreamWordCount {
    /**
     * Word count over a bounded stream using the DataStream API.
     *
     * <p>Reads "input/words.txt" as a (finite) stream, emits (word, 1) tuples,
     * keys by the word, and prints a running sum per key. Output lines are
     * prefixed with the parallel subtask index (thread number in the IDE).
     *
     * @param args unused
     * @throws Exception if job construction or execution fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 2. Read the file as a bounded stream.
        DataStreamSource<String> lineDataStreamSource = env.readTextFile("input/words.txt");

        // 3. Split each line into words and emit (word, 1) tuples.
        //    returns(...) is required: type erasure hides the Tuple2 generics from the lambda.
        SingleOutputStreamOperator<Tuple2<String, Long>> wordAndOneTuple = lineDataStreamSource
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    for (String word : line.split(" ")) {
                        // 1L (uppercase suffix) — lowercase 'l' is easily misread as the digit '1'.
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // 4. Key the stream by the word (tuple field f0).
        KeyedStream<Tuple2<String, Long>, String> wordAndOneGroup = wordAndOneTuple.keyBy(data -> data.f0);

        // 5. Rolling sum of the counts (tuple field 1) per key.
        SingleOutputStreamOperator<Tuple2<String, Long>> sum = wordAndOneGroup.sum(1);

        // 6. Print each update to stdout.
        sum.print();

        // 7. Launch the job; the pipeline above only describes it.
        env.execute();
    }
}
idea采用了多线程的方式,模拟了Flink的分布式执行,前面的序号 就是线程号,就是并行子任务的编号
输出
3> (hello,1)
5> (world,1)
3> (hello,2)
2> (java,1)
3> (hello,3)
7> (flink,1)
并行的子任务能有多少个?----取决于并行度--默认就是运行电脑的当前的cpu核心数,例如我电脑是8核的,所以前面的序号是1~8之间
同一组的子任务 的处理线程是同一个
3、DataStream API案例—无界流式处理(socket文本流)
在本机虚拟机node4节点上,安装
yum install nmap-ncat -y //centos
安装成功后
[root@node4 ~]# nc -lk 7777
启动java
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamWordCount {
    /**
     * Word count over an unbounded socket text stream using the DataStream API.
     *
     * <p>Connects to a host/port served by {@code nc -lk 7777}, splits each
     * received line into words, and prints a running (word, count) per key.
     * The job runs until cancelled.
     *
     * @param args unused (see the ParameterTool variant below for passing host/port)
     * @throws Exception if job construction or execution fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 2. Read an unbounded text stream from the socket (replace with your host IP).
        DataStreamSource<String> lineDataStream = env.socketTextStream("您的ip地址", 7777);

        // 3. Split each line into words and emit (word, 1) tuples.
        //    returns(...) is required: type erasure hides the Tuple2 generics from the lambda.
        SingleOutputStreamOperator<Tuple2<String, Long>> wordAndOneTuple = lineDataStream
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    for (String word : line.split(" ")) {
                        // 1L (uppercase suffix) — lowercase 'l' is easily misread as the digit '1'.
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // 4. Key the stream by the word (tuple field f0).
        KeyedStream<Tuple2<String, Long>, String> wordAndOneGroup = wordAndOneTuple.keyBy(data -> data.f0);

        // 5. Rolling sum of the counts (tuple field 1) per key.
        SingleOutputStreamOperator<Tuple2<String, Long>> sum = wordAndOneGroup.sum(1);

        // 6. Print each update to stdout.
        sum.print();

        // 7. Launch the job; it keeps listening and computing until cancelled.
        env.execute();
    }
}
linux中输入
[root@node4 ~]# nc -lk 7777
hello word
日志输出
3> (hello,1)
6> (word,1)
优化java
// Read the host and port from the command-line args, e.g. --ipaddr 192.168.1.10 --port 7777
ParameterTool parameterTool = ParameterTool.fromArgs(args);
String ipaddr = parameterTool.get("ipaddr");
// NOTE(review): if "--port" is absent this may yield null/throw — consider get(key, defaultValue); verify against ParameterTool docs
Integer port = parameterTool.getInt("port");
// 2. Read the socket text stream using the supplied host and port
DataStreamSource<String> lineDataStream = env.socketTextStream(ipaddr, port);