package com.sandra.day03;
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Demonstrates which of four downstream subtasks {@code keyBy} sends a record to when the
 * records come from two upstream subtasks.
 *
 * <p>Text lines are read from a socket, parsed into {@link WaterSensor} records by a map
 * operator with parallelism 2, printed, keyed by sensor id, and printed again with the
 * environment's default parallelism of 4. Comparing the two print prefixes shows where each
 * record travelled.
 *
 * <p>Usage: {@code Flink04_Transform_KeyBy [host [port]]}. Host and port default to
 * {@code hadoop102} and {@code 9999} when omitted.
 */
public class Flink04_Transform_KeyBy {

    /** Socket host used when none is supplied on the command line. */
    private static final String DEFAULT_HOST = "hadoop102";

    /** Socket port used when none is supplied on the command line. */
    private static final int DEFAULT_PORT = 9999;

    /** Minimum number of whitespace-separated fields per input line: id, ts, vc. */
    private static final int FIELD_COUNT = 3;

    /**
     * Builds and runs the job.
     *
     * @param args optional {@code host} and {@code port} of the text socket to read from
     * @throws Exception if the port argument is not an integer or job execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the original hard-coded endpoint when no arguments are given.
        String host = (args != null && args.length > 0) ? args[0] : DEFAULT_HOST;
        int port = (args != null && args.length > 1) ? Integer.parseInt(args[1]) : DEFAULT_PORT;

        // 1. Obtain the streaming execution environment. Operators that do not set their
        //    own parallelism run with 4.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        // 2. Read text lines from the socket.
        DataStreamSource<String> streamSource = env.socketTextStream(host, port);

        // 3. Convert every line into a WaterSensor; this is the two-subtask "upstream".
        SingleOutputStreamOperator<WaterSensor> result = streamSource.map(new MapFunction<String, WaterSensor>() {
            @Override
            public WaterSensor map(String line) throws Exception {
                return parseWaterSensor(line);
            }
        }).setParallelism(2);

        // Print the upstream records to see which of the two upstream subtasks handled each
        // one. Map and print are both set to parallelism 2 so that they can be chained
        // together (the original author's stated intent).
        result.print("原始数据").setParallelism(2);

        // 4. Group records that share the same id.
        KeyedStream<WaterSensor, String> keyedStream = result.keyBy(new KeySelector<WaterSensor, String>() {
            @Override
            public String getKey(WaterSensor value) throws Exception {
                return value.getId();
            }
        });
        // Field-name alternative kept from the original notes; it yields a Tuple key type:
        //   KeyedStream<WaterSensor, Tuple> keyedStream1 = result.keyBy("id");

        // Print the keyed records (parallelism 4) to see which downstream subtask each
        // upstream record was routed to.
        keyedStream.print("keyBy");

        env.execute();
    }

    /**
     * Parses one input line of the form {@code <id> <ts> <vc>}.
     *
     * <p>Fields may be separated by any run of whitespace, and leading or trailing
     * whitespace is ignored. Fields after the third are ignored.
     *
     * @param line raw text line read from the socket
     * @return the record described by the line
     * @throws IllegalArgumentException if the line has fewer than three fields or if
     *     {@code ts} / {@code vc} are not valid integers; the message contains the line
     */
    private static WaterSensor parseWaterSensor(String line) {
        String[] fields = line.trim().split("\\s+");
        if (fields.length < FIELD_COUNT) {
            throw new IllegalArgumentException(
                    "Expected \"<id> <ts> <vc>\" but got: \"" + line + "\"");
        }
        try {
            return new WaterSensor(fields[0], Long.parseLong(fields[1]), Integer.parseInt(fields[2]));
        } catch (NumberFormatException e) {
            // Keep the cause so the failing token is still visible in the stack trace.
            throw new IllegalArgumentException(
                    "Non-numeric ts or vc in line: \"" + line + "\"", e);
        }
    }
}
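// The WaterSensor type used above is defined as follows.
// NOTE(review): the job imports com.atguigu.bean.WaterSensor, but this definition is declared in com.sandra.bean — confirm which package is intended.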
package com.sandra.bean;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain data holder for one sensor record: an id plus the {@code ts} and {@code vc} values.
 *
 * <p>No constructors or accessors are written by hand; they are requested from Lombok via
 * the annotations below (a no-argument constructor, an all-arguments constructor in field
 * order, and the members provided by {@code @Data}).
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
public class WaterSensor {
// Sensor identifier.
private String id;
// NOTE(review): presumably the record's timestamp — confirm unit and epoch.
private Long ts;
// NOTE(review): presumably the measured water level — confirm meaning and unit.
private Integer vc;
}
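/*
 * Summary:
 * - Records with the same key always end up in the same partition.
 * - One partition may hold several different keys.
 * - All records within one group share the same key.
 * - One partition may contain several different groups.
 * - Grouping is a logical division; partitioning is a physical division.
 */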