Filter operator: filtering
The filter function evaluates each incoming element: if it returns true, the element is passed downstream; if it returns false, the element is dropped. For example: if the incoming product price is greater than 100, print the record; if it is 100 or less, discard it.
package Flink_API;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.io.Serializable;
import java.util.Properties;
public class TestFilter {
public static void main(String[] args) throws Exception {
// Get the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(1);
// Read user browse events from Kafka
Properties consumerProperties=new Properties();
consumerProperties.setProperty("bootstrap.servers","page01");
consumerProperties.setProperty("groud.id","browsegroup");
DataStreamSource<String> dataStreamSource=env.addSource(new FlinkKafkaConsumer010<String>("topic",new SimpleStringSchema(),consumerProperties));
// Parse the JSON data
DataStream<UserBrowseLog> processData=dataStreamSource.process(new ProcessFunction<String, UserBrowseLog>() {
@Override
public void processElement(String s, Context context, Collector<UserBrowseLog> collector) throws Exception {
try{
UserBrowseLog browseLog= JSON.parseObject(s, UserBrowseLog.class);
if(browseLog!=null){
collector.collect(browseLog);
}
}catch(Exception e){
System.out.print("解析Json_UserBrowseLog异常,异常信息是:"+e.getMessage());
}
}
}).setParallelism(2);
DataStream<UserBrowseLog> filter = processData.filter(new FilterFunction<UserBrowseLog>() {
@Override
public boolean filter(UserBrowseLog userBrowseLog) throws Exception {
// Keep only records whose product price is greater than 100
return userBrowseLog.getProductPrice() > 100;
}
});
filter.print();
env.execute("TestFileter");
}
public static class UserBrowseLog implements Serializable {
private String userID;
private String eventTime;
private String eventType;
private String productID;
private Integer productPrice;
public String getUserID() {
return userID;
}
public void setUserID(String userID) {
this.userID = userID;
}
public String getEventTime() {
return eventTime;
}
public void setEventTime(String eventTime) {
this.eventTime = eventTime;
}
public String getEventType() {
return eventType;
}
public void setEventType(String eventType) {
this.eventType = eventType;
}
public String getProductID() {
return productID;
}
public void setProductID(String productID) {
this.productID = productID;
}
public Integer getProductPrice() {
return productPrice;
}
public void setProductPrice(Integer productPrice) {
this.productPrice = productPrice;
}
@Override
public String toString() {
return "UserBrowseLog{" +
"userID='" + userID + '\'' +
", eventTime='" + eventTime + '\'' +
", eventType='" + eventType + '\'' +
", productID='" + productID + '\'' +
", productPrice=" + productPrice +
'}';
}
}
}
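Since FilterFunction has a single abstract method, the same filter can also be written as a lambda. A minimal sketch, applied to the processData stream from the example above (the variable name expensiveOnly is illustrative):
// Equivalent lambda form: keep only browse events priced above 100
DataStream<UserBrowseLog> expensiveOnly =
        processData.filter(browseLog -> browseLog.getProductPrice() > 100);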
KeyBy operator: suppose we want to count, per user, the product-browse events read from Kafka. The records in Kafka arrive unordered, so the approach is to first group each user's browse events and then aggregate the matching ones. In MapReduce the shuffle automatically routes records with the same key to the same task; Flink does not do this by default, and keyBy is what sends all records with the same key to the same downstream task.
1. keyBy only partitions the stream; it is not a true operator, so setParallelism cannot be called on it.
2. The partitioning result is strongly tied to the parallelism of the downstream operator: if the downstream operator has a parallelism of 1, every record ends up in the same subtask no matter how the stream is keyed.
3. For POJO types, keyBy(fieldName) partitions on the named field of the object.
4. For tuple types, keyBy(fieldPosition) partitions on the element at the given position in the tuple.
5. For other types, keyBy(new KeySelector<...>() {...}) partitions on a key extracted by the selector.
All three forms are shown in the sketch after the snippet below.
Note: the following types cannot be used as a key:
1. a POJO that does not override hashCode() and instead relies on Object.hashCode();
2. arrays of any type;
3. primitive types such as int and long.
// Group by the "word" field; the result is a KeyedStream<SocketWindowWordCountJava.WordWithCount, Tuple>
.keyBy("word")
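To make items 3 to 5 concrete, here is a minimal sketch of the three keyBy variants. processData is the UserBrowseLog stream from the example below; tupleStream is an assumed DataStream<Tuple2<String, Integer>>, not part of the original example:
// 3. POJO type: partition on a named field
KeyedStream<UserBrowseLog, Tuple> byUserField = processData.keyBy("userID");
// 4. Tuple type: partition on the element at position 0
KeyedStream<Tuple2<String, Integer>, Tuple> byPosition = tupleStream.keyBy(0);
// 5. Any type: partition on a key extracted by a KeySelector
KeyedStream<UserBrowseLog, String> bySelector =
        processData.keyBy(new KeySelector<UserBrowseLog, String>() {
            @Override
            public String getKey(UserBrowseLog log) throws Exception {
                return log.getUserID();
            }
        });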
package Flink_API;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.io.Serializable;
import java.util.Properties;
public class TestKeyBy {
// Count how many times a user clicks within one minute
public static void main(String[] args) throws Exception {
// Get the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(1);
// Read user browse events from Kafka
Properties consumerProperties=new Properties();
consumerProperties.setProperty("bootstrap.servers","page01");
consumerProperties.setProperty("groud.id","browsegroup");
DataStreamSource<String> dataStreamSource=env.addSource(new FlinkKafkaConsumer010<String>("topic",new SimpleStringSchema(),consumerProperties));
// Parse the JSON data
DataStream<UserBrowseLog> processData=dataStreamSource.process(new ProcessFunction<String, UserBrowseLog>() {
@Override
public void processElement(String s, Context context, Collector<UserBrowseLog> collector) throws Exception {
try{
UserBrowseLog browseLog= JSON.parseObject(s,UserBrowseLog.class);
if(browseLog!=null){
collector.collect(browseLog);
}
}catch(Exception e){
System.out.print("解析Json_UserBrowseLog异常,异常信息是:"+e.getMessage());
}
}
}).setParallelism(2);
KeyedStream<UserBrowseLog,String> keyBy=processData.keyBy(new KeySelector<UserBrowseLog, String>() {
@Override
public String getKey(UserBrowseLog userBrowseLog) throws Exception {
return userBrowseLog.getUserID();
}
});
// One record is emitted downstream for each incoming record
keyBy.print();
env.execute("TestKeyBy");
}
public static class UserBrowseLog implements Serializable {
private String userID;
private String eventTime;
private String eventType;
private String productID;
private Integer productPrice;
public String getUserID() {
return userID;
}
public void setUserID(String userID) {
this.userID = userID;
}
public String getEventTime() {
return eventTime;
}
public void setEventTime(String eventTime) {
this.eventTime = eventTime;
}
public String getEventType() {
return eventType;
}
public void setEventType(String eventType) {
this.eventType = eventType;
}
public String getProductID() {
return productID;
}
public void setProductID(String productID) {
this.productID = productID;
}
public Integer getProductPrice() {
return productPrice;
}
public void setProductPrice(Integer productPrice) {
this.productPrice = productPrice;
}
@Override
public String toString() {
return "UserBrowseLog{" +
"userID='" + userID + '\'' +
", eventTime='" + eventTime + '\'' +
", eventType='" + eventType + '\'' +
", productID='" + productID + '\'' +
", productPrice=" + productPrice +
'}';
}
}
}
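The example above only keys the stream and prints it. To actually count clicks per user per minute, as the class comment suggests, a window would go on top of keyBy. A rough sketch, assuming imports for org.apache.flink.api.java.tuple.Tuple2 and org.apache.flink.api.common.typeinfo.Types (note that with the event-time characteristic set above, a timestamp/watermark assigner would also be needed before event-time windows fire):
// Hypothetical continuation: count events per user over 1-minute tumbling windows
processData
        .map(log -> new Tuple2<>(log.getUserID(), 1L))
        .returns(Types.TUPLE(Types.STRING, Types.LONG))
        .keyBy(0)
        .timeWindow(Time.minutes(1))
        .sum(1)
        .print();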
Reduce operator:
Performs a rolling aggregation over records with the same key: the current element is combined with the result of the previous rolling reduce, a new value is returned, and each rolling result is emitted to the downstream operator.
Example: a rolling sum of the prices of the products each user browses.
package Flink_API;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.io.Serializable;
import java.util.Properties;
public class TestReduce {
public static void main(String[] args) throws Exception {
// Get the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(1);
// Read user browse events from Kafka
Properties consumerProperties=new Properties();
consumerProperties.setProperty("bootstrap.servers","page01");
consumerProperties.setProperty("groud.id","browsegroup");
DataStreamSource<String> dataStreamSource=env.addSource(new FlinkKafkaConsumer010<String>("topic",new SimpleStringSchema(),consumerProperties));
// Parse the JSON data
DataStream<UserBrowseLog> processData=dataStreamSource.process(new ProcessFunction<String, UserBrowseLog>() {
@Override
public void processElement(String s, Context context, Collector<UserBrowseLog> collector) throws Exception {
try{
UserBrowseLog browseLog= JSON.parseObject(s,UserBrowseLog.class);
if(browseLog!=null){
collector.collect(browseLog);
}
}catch(Exception e){
System.out.print("解析Json_UserBrowseLog异常,异常信息是:"+e.getMessage());
}
}
}).setParallelism(2);
DataStream<UserBrowseLog> reduceMap=processData.keyBy("userID").reduce(new ReduceFunction<UserBrowseLog>() {
@Override
public UserBrowseLog reduce(UserBrowseLog t1, UserBrowseLog t2) throws Exception {
int i = t1.getProductPrice() + t2.getProductPrice();
return new UserBrowseLog(t1.getUserID(),"","","",i);
}
});
// Emits one rolling result per incoming record (only with a window would output be deferred until the window ends)
reduceMap.print();
env.execute("TestReduce");
}
public static class UserBrowseLog implements Serializable {
private String userID;
private String eventTime;
private String eventType;
private String productID;
private Integer productPrice;
public UserBrowseLog() {
// Flink needs a public no-argument constructor to treat this class as a POJO (required for keyBy("userID"))
}
public UserBrowseLog(String userID, String eventTime, String eventType, String productID, Integer productPrice) {
this.userID = userID;
this.eventTime = eventTime;
this.eventType = eventType;
this.productID = productID;
this.productPrice = productPrice;
}
public String getUserID() {
return userID;
}
public void setUserID(String userID) {
this.userID = userID;
}
public String getEventTime() {
return eventTime;
}
public void setEventTime(String eventTime) {
this.eventTime = eventTime;
}
public String getEventType() {
return eventType;
}
public void setEventType(String eventType) {
this.eventType = eventType;
}
public String getProductID() {
return productID;
}
public void setProductID(String productID) {
this.productID = productID;
}
public Integer getProductPrice() {
return productPrice;
}
public void setProductPrice(Integer productPrice) {
this.productPrice = productPrice;
}
@Override
public String toString() {
return "UserBrowseLog{" +
"userID='" + userID + '\'' +
", eventTime='" + eventTime + '\'' +
", eventType='" + eventType + '\'' +
", productID='" + productID + '\'' +
", productPrice=" + productPrice +
'}';
}
}
}
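To see the rolling behavior concretely: if user u1 browses three products priced 10, 20 and 30, the reduce above emits one record per input, with rolling prices 10, 30 and 60. A minimal local sketch, with fromElements standing in for the Kafka source (values such as "u1" and "p1" are illustrative):
// Rolling reduce over a fixed set of events for user "u1"
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.fromElements(
                new UserBrowseLog("u1", "", "", "p1", 10),
                new UserBrowseLog("u1", "", "", "p2", 20),
                new UserBrowseLog("u1", "", "", "p3", 30))
        .keyBy("userID")
        .reduce((t1, t2) -> new UserBrowseLog(t1.getUserID(), "", "", "",
                t1.getProductPrice() + t2.getProductPrice()))
        .print(); // rolling sums: 10, 30, 60
env.execute("ReduceRollingSketch");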
Aggregate operator:
Performs a rolling aggregation on a specified field of records with the same key: the current element is combined with the previous rolling result, a new value is returned, and each rolling result is emitted to the downstream operator. The difference between min and minBy: min returns the minimum value of the specified field, attached to the first record seen, while minBy returns the original record that contains the minimum value; max and maxBy behave the same way.
// Sum the first field (position 0) of the elements in the KeyedStream
keyedStream.sum(0);
// Sum the "count" field of the elements in the KeyedStream
keyedStream.sum("count");
// Minimum value of the first field in the KeyedStream
keyedStream.min(0);
// Element of the KeyedStream whose "count" field is minimal
keyedStream.minBy("count");
keyedStream.max("count");
keyedStream.maxBy(0);
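The difference is easiest to see with data. Suppose the same user's stream contains {productID=p1, price=300} followed by {productID=p2, price=100}:
// min keeps the minimal value of the field but the other fields of the first record:
// after the second record it emits {productID=p1, price=100}
keyedStream.min("productPrice");
// minBy keeps the whole record holding the minimal value:
// after the second record it emits {productID=p2, price=100}
keyedStream.minBy("productPrice");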
package Flink_API;
import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.io.Serializable;
import java.util.Properties;
public class TestAggregate {
public static void main(String[] args) throws Exception {
// Get the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(1);
// Read user browse events from Kafka
Properties consumerProperties=new Properties();
consumerProperties.setProperty("bootstrap.servers","page01");
consumerProperties.setProperty("groud.id","browsegroup");
DataStreamSource<String> dataStreamSource=env.addSource(new FlinkKafkaConsumer010<String>("topic",new SimpleStringSchema(),consumerProperties));
// Parse the JSON data
DataStream<UserBrowseLog> processData=dataStreamSource.process(new ProcessFunction<String, UserBrowseLog>() {
@Override
public void processElement(String s, Context context, Collector<UserBrowseLog> collector) throws Exception {
try{
UserBrowseLog browseLog= JSON.parseObject(s,UserBrowseLog.class);
if(browseLog!=null){
collector.collect(browseLog);
}
}catch(Exception e){
System.out.print("解析Json_UserBrowseLog异常,异常信息是:"+e.getMessage());
}
}
}).setParallelism(2);
// Without a window: rolling maximum of product price per user
DataStream<UserBrowseLog> maxByData=processData.keyBy("userID").maxBy("productPrice");
maxByData.print();
env.execute("TestAggregate");
}
public static class UserBrowseLog implements Serializable {
private String userID;
private String eventTime;
private String eventType;
private String productID;
private Integer productPrice;
public String getUserID() {
return userID;
}
public void setUserID(String userID) {
this.userID = userID;
}
public String getEventTime() {
return eventTime;
}
public void setEventTime(String eventTime) {
this.eventTime = eventTime;
}
public String getEventType() {
return eventType;
}
public void setEventType(String eventType) {
this.eventType = eventType;
}
public String getProductID() {
return productID;
}
public void setProductID(String productID) {
this.productID = productID;
}
public Integer getProductPrice() {
return productPrice;
}
public void setProductPrice(Integer productPrice) {
this.productPrice = productPrice;
}
@Override
public String toString() {
return "UserBrowseLog{" +
"userID='" + userID + '\'' +
", eventTime='" + eventTime + '\'' +
", eventType='" + eventType + '\'' +
", productID='" + productID + '\'' +
", productPrice=" + productPrice +
'}';
}
}
}