Flink project java篇
pom.xml以及数据
[戳我👇](链接: https://pan.baidu.com/s/1ASPKIqxT4cM63Q0swZuqrg 密码: d58l)
attention
注意事项:1.keyBy中如果使用字符串,需要进行TypeInformation的转换
scala mistake summary 【summary most important!!!】
- jar包版本
- process之前没有window()相关的函数,因此process中定义的定时器不起作用[待进一步验证]
- 注意各种隐式转换的问题
- keyBy等的关键字,最好采用的是之前类中包含的,不然使用Tuple需要TypeInformation指定相应的类型
stage01 user behavior analysis
1.MyHotItemAnalysis.java
package com.wyj.myUserBehaviorAnalysis;
import lombok.Data;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.RichAggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.aggregation.AggregationFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
// Entry point: reads UserBehavior.csv from the classpath, counts "pv" events
// per item over 1-hour sliding windows (5-minute slide), then ranks and
// prints the top-3 hot items per window end.
public class MyHotItemAnalysis {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Parallelism 1 keeps the printed ranking output deterministic and ordered.
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStreamSource<String> in = env.readTextFile(
env.getClass().getResource("/UserBehavior.csv").getPath());
in
// Parse CSV: userId,itemId,categoryId,behavior,timestamp(epoch seconds).
.map(data ->{
String[] fields = data.split(",");
return new MyUserBehavior(
fields[0],
fields[1],
fields[2],
fields[3],
Long.parseLong(fields[4])
);
})
// Keep only page-view events.
.filter(data -> "pv".equals(data.getBehavior()))
// Timestamps in the file are monotonically increasing, so a simple
// ascending extractor suffices; seconds are converted to milliseconds
// as Flink event time expects ms.
.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<MyUserBehavior>(){
@Override
public long extractAscendingTimestamp(MyUserBehavior myUserBehavior) {
return myUserBehavior.getTs() * 1000;
}
})
// Count clicks per item within each sliding window; MyWindowFunc tags the
// count with the item id and the window end timestamp.
.keyBy(data -> data.getItemId())
.timeWindow(Time.hours(1) , Time.minutes(5))
.aggregate(new MyCountAgg() , new MyWindowFunc())
// Re-key by window end so PrintHotTopItem can rank all items of one window.
.keyBy(data -> data.getTs())
.process(new PrintHotTopItem(3))
.print();
env.execute("my hot item analysis (java) job");
}
}
// Ranks the hot items of a single window: buffers every MyHotItem sharing the
// same window-end timestamp in list state, and when the event-time timer fires
// (1 ms past the window end) sorts them by count and emits the top `topSize`
// items as one formatted string.
class PrintHotTopItem extends ProcessFunction<MyHotItem , String> {
private int topSize ;
// Records are keyed only by window end (not by (window, itemId) pairs), so a
// single list state per key collects one whole window's results.
private ListState<MyHotItem> listState;
public PrintHotTopItem(){
}
public PrintHotTopItem(int topSize) {
this.topSize = topSize;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
listState = getRuntimeContext().getListState(
new ListStateDescriptor<MyHotItem>("hot item",MyHotItem.class)
);
}
@Override
public void processElement(MyHotItem myHotItem, Context context, Collector<String> collector) throws Exception {
listState.add(myHotItem);
// Register an event-time timer 1 ms after the window end; by then every
// element belonging to this window end has been collected.
context.timerService().registerEventTimeTimer(myHotItem.getTs() + 1);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
super.onTimer(timestamp, ctx, out);
// Drain the buffered items for this window end, then clear the state.
ArrayList<MyHotItem> items = new ArrayList<>();
for (MyHotItem myHotItem : listState.get()) {
items.add(myHotItem);
}
listState.clear();
// Sort descending by count. Long.compare avoids the overflow of the
// previous `(int) (j - i)` subtraction-based comparator.
items.sort((i, j) -> Long.compare(j.getHotCount(), i.getHotCount()));
StringBuilder builder = new StringBuilder();
builder.append("=================\n")
.append("time : " + new SimpleDateFormat("MM/dd-HH:mm:ss").format((timestamp - 1)) + "\n");
// Bound the loop by the actual item count: a window with fewer than
// topSize distinct items previously threw IndexOutOfBoundsException.
int limit = Math.min(topSize, items.size());
for(int i = 0 ; i < limit ; i++){
builder.append("第" + (i+1) + "名:\t")
.append(items.get(i).getItemId() + "\t")
.append(items.get(i).getHotCount() + "\n");
}
out.collect(builder.toString());
}
}
// Wraps the pre-aggregated per-item count into a MyHotItem carrying the key
// (item id) and the end timestamp of the window it was computed in.
class MyWindowFunc implements WindowFunction<Long,MyHotItem,String,TimeWindow> {
@Override
public void apply(String itemId, TimeWindow window, java.lang.Iterable<Long> input, Collector<MyHotItem> out) {
// The upstream AggregateFunction emits exactly one Long per window.
Long count = input.iterator().next();
Long windowEnd = window.getEnd();
out.collect(new MyHotItem(itemId, windowEnd, count));
}
}
// Incremental per-window counter: the accumulator is a plain Long that is
// bumped once per record; the window result is the final count itself.
class MyCountAgg implements AggregateFunction<MyUserBehavior, Long , Long> {
@Override
public Long createAccumulator() {
// Counting starts from zero.
return 0L;
}
@Override
public Long add(MyUserBehavior value, Long acc) {
// Every record contributes exactly one to the count.
return acc + 1;
}
@Override
public Long getResult(Long acc) {
return acc;
}
@Override
public Long merge(Long left, Long right) {
// Combine two partial counts.
return left + right;
}
}
// Result POJO emitted by the window function: item id, window-end timestamp,
// and the item's click count within that window. Lombok @Data supplies the
// getters/setters/equals/hashCode; toString is spelled out explicitly.
@Data
class MyHotItem{
private String itemId;
private Long ts;
private Long hotCount;
// No-arg constructor kept for framework reflection/serialization use.
public MyHotItem(){
}
public MyHotItem(String itemId, Long ts, Long hotCount) {
this.itemId = itemId;
this.ts = ts;
this.hotCount = hotCount;
}
@Override
public String toString() {
// Produces the same text as the original field-concatenation version.
StringBuilder sb = new StringBuilder("MyHotItem{");
sb.append("itemId='").append(itemId).append('\'');
sb.append(", ts=").append(ts);
sb.append(", hotCount=").append(hotCount);
sb.append('}');
return sb.toString();
}
}
// Input POJO: one parsed row of UserBehavior.csv. All id fields are kept as
// raw strings; ts is the event timestamp. Lombok @Data generates the
// accessors; toString is implemented by hand.
@Data
class MyUserBehavior{
private String userId;
private String itemId;
private String catagoryId;
private String behavior;
private Long ts;
// No-arg constructor kept for framework reflection/serialization use.
public MyUserBehavior(){
}
public MyUserBehavior(String userId, String itemId, String catagoryId, String behavior, Long ts) {
this.userId = userId;
this.itemId = itemId;
this.catagoryId = catagoryId;
this.behavior = behavior;
this.ts = ts;
}
@Override
public String toString() {
// Same textual form as the original concatenation-based implementation.
StringBuilder sb = new StringBuilder("MyUserBehavior{");
sb.append("userId='").append(userId).append('\'');
sb.append(", itemId='").append(itemId).append('\'');
sb.append(", catagoryId='").append(catagoryId).append('\'');
sb.append(", behavior='").append(behavior).append('\'');
sb.append(", ts=").append(ts);
sb.append('}');
return sb.toString();
}
}
stage02 network total flow analysis
1.PageFlow(pv)
注意事项:1.时间窗口[start, end)是左闭右开的
2.针对java transform的函数,我们尽量先拜读源码,查看相应的参数,再编程
3.需要特别注意的是:Scala和Java的api很容易混淆,尽量不要在同一个项目中混用编写
package com.wyj.networkFlowAnalysis;
import lombok.Data;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
/**
* 543462,1715,1464116,pv,1511658000
* 662867,2244074,1575622,pv,1511658000
*/
// Hourly page-view (pv) counter over UserBehavior.csv: maps every pv event to
// ("pv", 1), keys on the constant tag, and sums per 1-hour tumbling window.
public class PageFlow {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.readTextFile(env.getClass().getResource("/UserBehavior.csv").getPath())
// Parse CSV: userId,itemId,categoryId,behavior,timestamp(epoch seconds).
.map(new RichMapFunction<String, MyUserBehavior>() {
@Override
public MyUserBehavior map(String s) throws Exception {
String []fields = s.split(",");
return new MyUserBehavior(
fields[0],
fields[1],
fields[2],
fields[3],
Long.parseLong(fields[4])
);
}
})
.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<MyUserBehavior>() {
@Override
public long extractAscendingTimestamp(MyUserBehavior obj) {
// ts is in epoch seconds (see the sample rows above) while Flink
// event time is in milliseconds — convert, otherwise Time.hours(1)
// windows would actually span ~41 days of data. This also matches
// the conversion done in MyHotItemAnalysis.
return obj.getTs() * 1000;
}
})
.filter(new FilterFunction<MyUserBehavior>() {
@Override
public boolean filter(MyUserBehavior myUserBehavior) throws Exception {
// Keep only page-view events.
return "pv".equals(myUserBehavior.getBehavior());
}
})
.map(new RichMapFunction<MyUserBehavior, Tuple2<String , Long>>() {
@Override
public Tuple2<String , Long> map(MyUserBehavior s) throws Exception {
// Constant key: we want one global pv count per window.
return new Tuple2<>("pv" , 1L);
}
})
.keyBy(0)
.timeWindow(Time.hours(1))
.sum(1)
.print();
env.execute("pv count job");
}
}
// One parsed row of UserBehavior.csv for the network-flow package; id fields
// stay as raw strings and ts is the event timestamp. Lombok @Data provides
// the accessors while toString is defined manually.
@Data
class MyUserBehavior{
private String userId;
private String itemId;
private String catagoryId;
private String behavior;
private Long ts;
// No-arg constructor kept for framework reflection/serialization use.
public MyUserBehavior(){
}
public MyUserBehavior(String userId, String itemId, String catagoryId, String behavior, Long ts) {
this.userId = userId;
this.itemId = itemId;
this.catagoryId = catagoryId;
this.behavior = behavior;
this.ts = ts;
}
@Override
public String toString() {
// Emits exactly the same text as the concatenation-based original.
StringBuilder sb = new StringBuilder("MyUserBehavior{");
sb.append("userId='").append(userId).append('\'');
sb.append(", itemId='").append(itemId).append('\'');
sb.append(", catagoryId='").append(catagoryId).append('\'');
sb.append(", behavior='").append(behavior).append('\'');
sb.append(", ts=").append(ts);
sb.append('}');
return sb.toString();
}
}
2.MyHotPage
注意事项:1.filter中可采用正则表达式进行数据过滤,有一个预定义正则貌似能加快速度
2.统计热度,思路大致是一样的,先keyBy itemId ,窗口,在进行排序统计
3.编程思想:先keyBy itemId 再aggregate(累加器,窗口函数[累加器的输出就是窗口函数的输入])
然后对窗口进行keyBy,然后采用process(自定义KeyedProcessFunction),
再使用定时器进行缓存数据以方便清理State,然后进行排序{例:records.sortBy(_.hot)(Ordering.Long.reverse).take(top3)}后输出
package com.wyj.networkFlowAnalysis;
import lombok.Data;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
//org.apache.flink.api.common.functions
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 输出每个周期热门界面前3
*/
public class MyHotPage {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.readTextFile(env.getClass().getResource("/apache.log").getPath())
.map(data -> {
String[] fields = data.split(" ");
//System.out.println(fields.length);
return new MyApacheEvent(
fields[0],
fields[2],
new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
.parse(fields[3]).getTime(),
fields[5],
fields[6]
);
})
.filter(data -> {
/**
* scala Pattern
* val pattern = "^((?!\\.(css|js)$).)*$".r
* (pattern findFirstIn data.url).nonEmpty
*
*/
String pattern = "^((?!\\.(css|js)$).)*$"