flink 本地部署
wget https://mirrors.bfsu.edu.cn/apache/flink/flink-1.11.2/flink-1.11.2-bin-scala_2.11.tgz
[hadoop@slave1 soft]$ tar -zxvf flink-1.11.2-bin-scala_2.11.tgz
[hadoop@slave1 soft]$ cd flink-1.11.2
[hadoop@slave1 flink-1.11.2]$ pwd
/home/hadoop/soft/flink-1.11.2
[hadoop@slave1 flink-1.11.2]$ ls -l
total 580
drwxr-xr-x. 2 hadoop hadoop 4096 Sep 10 11:48 bin
drwxr-xr-x. 2 hadoop hadoop 4096 Sep 10 11:48 conf
drwxr-xr-x. 7 hadoop hadoop 76 Sep 10 11:48 examples
drwxr-xr-x. 2 hadoop hadoop 4096 Sep 10 11:48 lib
-rw-r--r--. 1 hadoop hadoop 11357 Aug 15 11:09 LICENSE
drwxr-xr-x. 2 hadoop hadoop 4096 Sep 10 11:48 licenses
drwxr-xr-x. 2 hadoop hadoop 6 Aug 16 16:01 log
-rw-r--r--. 1 hadoop hadoop 555312 Sep 10 11:48 NOTICE
drwxr-xr-x. 3 hadoop hadoop 4096 Sep 10 11:48 opt
drwxr-xr-x. 10 hadoop hadoop 210 Sep 10 11:48 plugins
-rw-r--r--. 1 hadoop hadoop 1309 Aug 15 11:09 README.txt
启动集群
[hadoop@slave1 flink-1.11.2]$ ./bin/start-cluster.sh
Starting cluster.
Starting standalonesession daemon on host slave1.
Starting taskexecutor daemon on host slave1.
打开浏览器
http://192.168.153.134:8081/#/overview
demo
package com.myflink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
/**
 * Streaming word count over text lines read from a socket.
 *
 * <p>The job connects to {@link #HOST}:{@link #PORT}, splits every received line into
 * words, groups the words by their text and prints the summed count of each word for
 * every {@link #WINDOW_SECONDS}-second time window.
 */
public class SocketWindowWordCount {

    /** Host the text server is listening on (for example one started with {@code nc -lk 9999}). */
    private static final String HOST = "192.168.153.134";

    /** Port the text server is listening on. */
    private static final int PORT = 9999;

    /** Length of the counting window, in seconds. */
    private static final long WINDOW_SECONDS = 5;

    /**
     * Builds the word-count pipeline and submits it under the job name
     * {@code SocketWindowWordCount}.
     *
     * @param args command-line arguments; not used
     * @throws Exception if {@code env.execute} fails
     */
    public static void main(String[] args) throws Exception {
        // Create the execution environment.
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Tuple2<String, Integer>> dataStream = env
                .socketTextStream(HOST, PORT)
                .flatMap(new Splitter())
                // f0 is the word, f1 (index 1) is the count that gets summed.
                .keyBy(value -> value.f0)
                .timeWindow(Time.seconds(WINDOW_SECONDS))
                .sum(1);

        dataStream.print();

        env.execute("SocketWindowWordCount");
    }

    /**
     * Splits a line on single spaces and emits {@code (word, 1)} for every non-empty token.
     */
    public static class Splitter implements FlatMapFunction<String, Tuple2<String, Integer>> {

        /**
         * Emits one {@code (word, 1)} tuple per word of the given line.
         *
         * @param sentence the line of text to split
         * @param out collector receiving one tuple per word
         * @throws Exception declared by the {@code FlatMapFunction} contract
         */
        @Override
        public void flatMap(String sentence, Collector<Tuple2<String, Integer>> out) throws Exception {
            for (String word : sentence.split(" ")) {
                // A blank line, or leading/consecutive spaces, makes split(" ") yield empty
                // strings; those are not words and must not be counted.
                if (word.isEmpty()) {
                    continue;
                }
                out.collect(new Tuple2<>(word, 1));
            }
        }
    }
}
报错信息
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Exception in thread "main" java.lang.IllegalStateException: No ExecutorFactory found to execute the application.
at org.apache.flink.core.execution.DefaultExecutorServiceLoader.getExecutorFactory(DefaultExecutorServiceLoader.java:84)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.executeAsync(StreamExecutionEnvironment.java:1801)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1711)
at org.apache.flink.streaming.api.environment.LocalStreamEnvironment.execute(LocalStreamEnvironment.java:74)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1697)
at com.myflink.SocketWindowWordCount.main(SocketWindowWordCount.java:23)
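解决方法:
在 pom.xml 中添加 flink-clients 依赖(从 Flink 1.11 开始, flink-streaming-java 不再自动引入 flink-clients, 本地运行时需要显式添加; artifactId 中的 Scala 版本后缀应与项目中其它 Flink 依赖保持一致)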
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-clients -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId>
<version>1.11.2</version>
</dependency>
运行(先用 nc 监听 9999 端口, 再启动程序, 然后在 nc 中输入文本)
[root@slave1 flink-1.11.2]# nc -lk 9999
8923402
23jekjsef
3094020842
dfadfa
tryeyetr jhttyd
trtrtrtrt
trtrtrtrt
trtrtrtrt
rtrtrtrtt
rtrtrtrtr
rtrtrtrtr
rtrtrtrt
运行结果(每行前缀 N> 是输出该条记录的并行子任务编号; 同一个单词落在不同的 5 秒窗口内会分别计数, 所以 trtrtrtrt 输出了两次)
2> (23jekjsef,1)
4> (3094020842,1)
4> (jhttyd,1)
4> (tryeyetr,1)
5> (8923402,1)
5> (dfadfa,1)
8> (trtrtrtrt,2)
8> (trtrtrtrt,1)
3> (rtrtrtrtr,1)
6> (rtrtrtrtt,1)
3> (rtrtrtrtr,1)
7> (rtrtrtrt,1)
DataStream API
1.运行环境StreamExecutionEnvironment
StreamExecutionEnvironment是个抽象类,是流式程序执行的上下文,实现类有两个,分别是
LocalStreamEnvironment: 在当前JVM中执行程序
RemoteStreamEnvironment: 在远程环境(集群)上执行程序
/**
* The StreamExecutionEnvironment is the context in which a streaming program is executed. A
* {@link LocalStreamEnvironment} will cause execution in the current JVM, a
* {@link RemoteStreamEnvironment} will cause execution on a remote setup.
*
* <p>The environment provides methods to control the job execution (such as setting the parallelism
* or the fault tolerance/checkpointing parameters) and to interact with the outside world (data access).
*
* @see org.apache.flink.streaming.api.environment.LocalStreamEnvironment
* @see org.apache.flink.streaming.api.environment.RemoteStreamEnvironment
*/
2.数据源DataSource数据输入
包含了输入格式InputFormat
/**
* Creates a new data source.
*
* @param context The environment in which the data source gets executed.
* @param inputFormat The input format that the data source executes.
* @param type The type of the elements produced by this input format.
*/
public DataSource(ExecutionEnvironment context, InputFormat<OUT, ?> inputFormat, TypeInformation<OUT> type, String dataSourceLocationName) {
super(context, type);
this.dataSourceLocationName = dataSourceLocationName;
if (inputFormat == null) {
throw new IllegalArgumentException("The input format may not be null.");
}
this.inputFormat = inputFormat;
if (inputFormat instanceof NonParallelInput) {
this.parallelism = 1;
}
}
3.DataStream转换
DataStream:同一个类型的流元素,DataStream可以通过transformation转换成另外的DataStream
StreamOperator:流式算子的基本接口,相关类型有三个(后两个是接口,不是实现类)
AbstractStreamOperator: 抽象类,为生命周期和属性相关的方法提供默认实现
OneInputStreamOperator: 接口,用于处理一个输入流的算子
TwoInputStreamOperator: 接口,用于处理两个输入流的算子
/**
* Basic interface for stream operators. Implementers would implement one of
* {@link org.apache.flink.streaming.api.operators.OneInputStreamOperator} or
* {@link org.apache.flink.streaming.api.operators.TwoInputStreamOperator} to create operators
* that process elements.
*
* <p>The class {@link org.apache.flink.streaming.api.operators.AbstractStreamOperator}
* offers default implementation for the lifecycle and properties methods.
*
* <p>Methods of {@code StreamOperator} are guaranteed not to be called concurrently. Also, if using
* the timer service, timer callbacks are also guaranteed not to be called concurrently with
* methods on {@code StreamOperator}.
*
* @param <OUT> The output type of the operator
*/
4.DataStreamSink输出
/**
 * Attaches the given sink function to this DataStream. Only streams that have a sink
 * attached are executed when {@link StreamExecutionEnvironment#execute()} is called.
 *
 * @param sinkFunction
 *            The object containing the sink's invoke function.
 * @return The closed DataStream.
 */
public DataStreamSink<T> addSink(SinkFunction<T> sinkFunction) {
    // Ask the input transformation for its output type up front so that a
    // MissingTypeInfo problem is reported here.
    transformation.getOutputType();

    // Sinks that need to know their input type are told about it before being wrapped.
    if (sinkFunction instanceof InputTypeConfigurable) {
        InputTypeConfigurable typeAware = (InputTypeConfigurable) sinkFunction;
        typeAware.setInputType(getType(), getExecutionConfig());
    }

    StreamSink<T> operator = new StreamSink<>(clean(sinkFunction));
    DataStreamSink<T> result = new DataStreamSink<>(this, operator);

    // Register the sink's transformation with the execution environment.
    getExecutionEnvironment().addOperator(result.getTransformation());
    return result;
}
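5.执行(LocalStreamEnvironment#execute: 在本地内嵌的MiniCluster上运行作业; 注意这段实现应来自较早的Flink版本, 与上文1.11.2的调用栈不一致, 请以实际使用版本的源码为准)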
/**
 * Runs this streaming program under the given job name on a local embedded
 * {@code MiniCluster} and returns what {@code executeJobBlocking} returns.
 *
 * <p>The configuration handed to the mini cluster is layered in this order: the job
 * graph's own configuration, then a managed memory size of "0", then this environment's
 * configuration, which overrides the earlier values.
 *
 * <p>The collected transformations are cleared and the mini cluster is closed in a
 * {@code finally} block, whether or not the run succeeds.
 *
 * @param jobName
 * name of the job
 * @return The result of the job execution, containing elapsed time and accumulators.
 * @throws Exception if any step below throws; nothing is caught in this method.
 */
@Override
public JobExecutionResult execute(String jobName) throws Exception {
// transform the streaming program into a JobGraph
StreamGraph streamGraph = getStreamGraph();
streamGraph.setJobName(jobName);
JobGraph jobGraph = streamGraph.getJobGraph();
jobGraph.setAllowQueuedScheduling(true);
// start from the job's own configuration, then force the managed memory size to "0"
Configuration configuration = new Configuration();
configuration.addAll(jobGraph.getJobConfiguration());
configuration.setString(TaskManagerOptions.MANAGED_MEMORY_SIZE, "0");
// add (and override) the settings with what the user defined
configuration.addAll(this.configuration);
// only set a REST bind port when none was configured
// NOTE(review): "0" presumably means "pick any free port" - confirm
if (!configuration.contains(RestOptions.BIND_PORT)) {
configuration.setString(RestOptions.BIND_PORT, "0");
}
// slots per task manager: the NUM_TASK_SLOTS setting, with the job's maximum parallelism
// passed as the second argument (presumably the fallback when the option is unset)
int numSlotsPerTaskManager = configuration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS, jobGraph.getMaximumParallelism());
MiniClusterConfiguration cfg = new MiniClusterConfiguration.Builder()
.setConfiguration(configuration)
.setNumSlotsPerTaskManager(numSlotsPerTaskManager)
.build();
if (LOG.isInfoEnabled()) {
LOG.info("Running job on local embedded Flink mini cluster");
}
MiniCluster miniCluster = new MiniCluster(cfg);
try {
miniCluster.start();
// record the port of the started cluster's REST address in the configuration
configuration.setInteger(RestOptions.PORT, miniCluster.getRestAddress().get().getPort());
return miniCluster.executeJobBlocking(jobGraph);
}
finally {
// always drop the collected transformations and shut the mini cluster down
transformations.clear();
miniCluster.close();
}
}