I: Stream Processing
Create an execution environment that represents the context in which the current program runs. If the program is invoked standalone, this method returns a local execution environment; if the program is invoked from a command-line client to be submitted to a cluster, it returns that cluster's execution environment. In other words, getExecutionEnvironment decides which environment to return based on how the program is run, and it is the most common way to create an execution environment.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
If no parallelism is set explicitly, the configuration in flink-conf.yaml takes effect; the default there is 1.
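The parallelism can also be set explicitly on the environment in code, which takes precedence over the config-file value for this job:
env.setParallelism(4); // overrides the flink-conf.yaml default for this job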
1: Creating a local execution environment with createLocalEnvironment
Returns a local execution environment; the default parallelism is specified at call time.
LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1);
2: Creating a remote (cluster) execution environment with createRemoteEnvironment
Returns a cluster execution environment and submits the jar to the remote server. The JobManager's host and port must be specified at call time, along with the jar to run on the cluster.
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment(
        "jobmanager-hostname", 6123, "YOURPATH//WordCount.jar"); // JobManager host, port, and jar location
II: Source
1: Reading data from a collection
package com.atguigu.source;

import com.atguigu.bean.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.ArrayList;

public class SourceTest1_Collection {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 1. Source: read data from a collection
        ArrayList<SensorReading> list = new ArrayList<>();
        list.add(new SensorReading("sensor_1", 1547718199L, 35.8));
        list.add(new SensorReading("sensor_6", 1547718201L, 15.4));
        list.add(new SensorReading("sensor_7", 1547718202L, 6.7));
        list.add(new SensorReading("sensor_10", 1547718205L, 38.1));
        DataStream<SensorReading> sensorDataStream = env.fromCollection(list);

        // 2. Print
        sensorDataStream.print();

        // 3. Execute
        env.execute();
    }
}
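The SensorReading bean used throughout these examples is defined elsewhere in the project. Judging from the constructor calls above, a minimal sketch (field names are assumptions):
package com.atguigu.bean;

// Assumed POJO matching the (id, timestamp, temperature) constructor used above
public class SensorReading {
    private String id;
    private Long timestamp;
    private Double temperature;

    public SensorReading() {} // Flink POJO serialization requires a no-arg constructor

    public SensorReading(String id, Long timestamp, Double temperature) {
        this.id = id;
        this.timestamp = timestamp;
        this.temperature = temperature;
    }

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public Long getTimestamp() { return timestamp; }
    public void setTimestamp(Long timestamp) { this.timestamp = timestamp; }
    public Double getTemperature() { return temperature; }
    public void setTemperature(Double temperature) { this.temperature = temperature; }

    @Override
    public String toString() {
        return "SensorReading{id='" + id + "', timestamp=" + timestamp + ", temperature=" + temperature + '}';
    }
}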
2: Reading data from a file
package com.atguigu.wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Streaming word count.
 */
public class WordCount2 {
    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the parallelism
        // env.setParallelism(1);

        // Read data from a file
        String path = "D:\\大数据组API\\Flink\\Flink01\\src\\main\\resources\\hello.txt";
        DataStreamSource<String> streamSource = env.readTextFile(path);

        SingleOutputStreamOperator<Tuple2<String, Integer>> result = streamSource
                .flatMap((FlatMapFunction<String, Tuple2<String, Integer>>) (value, out) -> {
                    String[] words = value.split(" ");
                    for (String word : words) {
                        out.collect(new Tuple2<>(word, 1));
                    }
                })
                // Java lambdas erase generics, so the output type must be declared explicitly
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(0)
                .sum(1)
                .disableChaining(); // disableChaining: do not chain this operator with its neighbors
        result.print();

        // Execute the job
        env.execute();
    }
}
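Java lambdas lose their generic type parameters to erasure, which is why the flatMap above needs the .returns(...) hint; without it, Flink rejects the job with a type-erasure error because it cannot determine the type of the Collector output. An anonymous FlatMapFunction carries its own type information and needs no hint; an equivalent sketch:
streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        for (String word : value.split(" ")) {
            out.collect(new Tuple2<>(word, 1));
        }
    }
}).keyBy(0).sum(1);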
3: Using a Kafka message queue as the source
package com.atguigu.source;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Properties;

public class SourceTest2_Kafka {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Kafka configuration
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "hadoop112:9092");
        properties.setProperty("key.deserializer",
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("value.deserializer",
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("auto.offset.reset", "latest");

        // 1. Read data from the "first" Kafka topic
        DataStream<String> dataStream = env.addSource(
                new FlinkKafkaConsumer011<String>("first", new SimpleStringSchema(), properties));

        // 2. Print
        dataStream.print();

        // 3. Execute
        env.execute();
    }
}
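Instead of relying on the auto.offset.reset property, the start position can be set explicitly on the consumer, which overrides any committed group offsets; a minimal sketch using the same env and properties as above:
FlinkKafkaConsumer011<String> consumer =
        new FlinkKafkaConsumer011<>("first", new SimpleStringSchema(), properties);
consumer.setStartFromLatest(); // ignore committed offsets, read only new records
// consumer.setStartFromEarliest(); // or replay the topic from the beginning
DataStream<String> dataStream = env.addSource(consumer);
Production setups usually also set a group.id property so the consumer can commit offsets under its group.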
4: Custom source
package com.atguigu.source;

import com.atguigu.bean.SensorReading;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.HashMap;
import java.util.Random;

public class MySensor implements SourceFunction<SensorReading> {
    // volatile, because cancel() is called from a different thread than run()
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<SensorReading> ctx) throws Exception {
        Random random = new Random();

        // Initialize ten sensors with a random baseline temperature
        HashMap<String, Double> sensorTempMap = new HashMap<String, Double>();
        for (int i = 0; i < 10; i++) {
            sensorTempMap.put("sensor_" + (i + 1), 60 + random.nextGaussian() * 20);
        }

        while (running) {
            for (String sensorId : sensorTempMap.keySet()) {
                // Let each temperature drift by a Gaussian step, then emit a reading
                Double newTemp = sensorTempMap.get(sensorId) + random.nextGaussian();
                sensorTempMap.put(sensorId, newTemp);
                ctx.collect(new SensorReading(sensorId, System.currentTimeMillis(), newTemp));
            }
            Thread.sleep(1000L);
        }
    }

    @Override
    public void cancel() {
        this.running = false;
    }
}
package com.atguigu.source;

import com.atguigu.bean.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SourceTest3_MySource {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 1. Read data from the custom source
        DataStreamSource<SensorReading> dataStream = env.addSource(new MySensor());

        // 2. Print
        dataStream.print();

        // 3. Execute
        env.execute();
    }
}
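A plain SourceFunction always runs with parallelism 1. If the source needs to run in parallel, it can extend RichParallelSourceFunction instead, in which case every subtask executes its own copy of run(); a minimal sketch reusing the generator idea above (the class name is hypothetical):
package com.atguigu.source;

import com.atguigu.bean.SensorReading;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

import java.util.Random;

// Hypothetical parallel variant: each subtask emits readings for one sensor id
public class MyParallelSensor extends RichParallelSourceFunction<SensorReading> {
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<SensorReading> ctx) throws Exception {
        Random random = new Random();
        int subtask = getRuntimeContext().getIndexOfThisSubtask();
        double temp = 60 + random.nextGaussian() * 20;
        while (running) {
            temp += random.nextGaussian();
            ctx.collect(new SensorReading("sensor_" + subtask, System.currentTimeMillis(), temp));
            Thread.sleep(1000L);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}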