I. Data Collection Service

There are two ways to get the log data from Nginx into Kafka:
One is to have Nginx write to Kafka directly, which requires the nginx-kafka plugin; see "Installing and using nginx-kafka" for the setup steps.
The other is to collect the data with OpenResty, land it in log files, and have Flume pick the files up and deliver them to Kafka; see "Installing and using OpenResty" for the OpenResty setup.
2. Writing nginx-kafka.conf
Install Flume on the machine where Nginx runs and use a TailDirSource (TAILDIR) together with a KafkaChannel to deliver the data to Kafka; no sink is needed.
a1.sources = r1
a1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /root/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /log/access-.*\.log
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092
a1.channels.c1.kafka.topic = access19
a1.channels.c1.parseAsFlumeEvent = false
a1.sources.r1.channels = c1
Start Flume:
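A typical launch command looks like the following (assuming the config above is saved as /root/nginx-kafka.conf; adjust the paths to your Flume installation):
bin/flume-ng agent -n a1 -c conf -f /root/nginx-kafka.conf -Dflume.root.logger=INFO,console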

II. Flink Common Utility Class
1. FlinkUtils
package cn._51doit.utils;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
public class FlinkUtils {
public static final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
public static ParameterTool parameterTool;
public static <T> DataStream<T> createKafkaStream(String[] args, Class<? extends DeserializationSchema<T>> deserializer) throws Exception {
parameterTool = ParameterTool.fromPropertiesFile(args[0]);
String checkpointPath = parameterTool.getRequired("checkpoint.path");
long checkpointInterval = parameterTool.getLong("checkpoint.interval", 30000);
//enable checkpointing
env.setStateBackend(new FsStateBackend(checkpointPath));
env.enableCheckpointing(checkpointInterval, CheckpointingMode.EXACTLY_ONCE);
//retain checkpoint data when the job is cancelled or fails, so the job can later be restored from a specific checkpoint if needed
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
//read data from Kafka
Properties properties = new Properties();//Kafka broker addresses and ports
properties.setProperty("bootstrap.servers", parameterTool.getRequired("bootstrap.servers"));
//offset reset strategy: if no offset has been committed yet, read from the beginning; otherwise resume from the committed offset
properties.setProperty("auto.offset.reset", parameterTool.getRequired("auto.offset.reset"));
//consumer group ID
properties.setProperty("group.id", parameterTool.getRequired("group.id"));
//if checkpointing is not enabled, let the Kafka consumer auto-commit offsets periodically
properties.setProperty("enable.auto.commit", parameterTool.getRequired("enable.auto.commit"));
//create the FlinkKafkaConsumer with the required arguments
String topics = parameterTool.getRequired("kafka.topics");
List<String> topicList = Arrays.asList(topics.split(","));
FlinkKafkaConsumer<T> kafkaConsumer = new FlinkKafkaConsumer<>(
topicList, //topics to read from
deserializer.newInstance(), //deserialization schema used to decode the records
properties //Kafka consumer properties
);
//do not commit offsets to Kafka's special offsets topic when checkpointing
kafkaConsumer.setCommitOffsetsOnCheckpoints(false);
return env.addSource(kafkaConsumer);
}
}
2. Calling the utility class
package cn._51doit.jobs;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
public class PreEtl {
public static void main(String[] args) throws Exception {
DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);
//Transformation
//sink
kafkaStream.print();
FlinkUtils.env.execute();
}
}
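When submitting the job, the path to the properties file described in the next section is passed as the first program argument (args[0]); a submission might look like this, with the jar name as a placeholder:
flink run -c cn._51doit.jobs.PreEtl realtime-project.jar /path/to/conf.properties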
3. conf.properties
checkpoint.interval=30000
checkpoint.path=hdfs://node-1.51doit.cn:9000/chk2021
bootstrap.servers=node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092
group.id=g10
enable.auto.commit=false
auto.offset.reset=earliest
kafka.topics=wordcount
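The LocationFunction job in section V reads two more entries from this same file, amap.http.url and amap.key. A sketch of those entries (the key is a placeholder you obtain by registering as an AMap developer; the URL is the AMap reverse-geocoding endpoint):
amap.http.url=https://restapi.amap.com/v3/geocode/regeo
amap.key=your-amap-key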
III. Project Requirements
Overview of the base data


IV. Counting Users by OS and New/Returning Status
package cn._51doit.jobs;
import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
/**
* Counts of new, returning and active users for the current day, with added dimensions (region, OS, event, brand)
*/
public class UserCount {
public static void main(String[] args) throws Exception {
DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);
//parse the data
SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());
SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));
SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> osNameIsNewAndOne = filtered
.map(bean -> Tuple3.of(bean.getOsName(), bean.getIsNew(), 1))
.returns(Types.TUPLE(Types.STRING, Types.INT, Types.INT));
//keyBy on osName and isNew
SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> res1 = osNameIsNewAndOne.keyBy(tp -> Tuple2.of(tp.f0, tp.f1), TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {
})).sum(2);
res1.print();
res1.map(new MapFunction<Tuple3<String, Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> map(Tuple3<String, Integer, Integer> value) throws Exception {
return Tuple2.of(value.f1, value.f2);
}
}).keyBy(t -> t.f0).sum(1).print("total");
//beanStream.print();
FlinkUtils.env.execute();
}
}
2. DataBean
package cn._51doit.pojo;
import java.util.HashMap;
public class DataBean {
private Integer id;
//Unique Device Identifier (UDID)
//private String udid;
private String deviceId;
private String guid;
private String account;
private String appId;
private String appVersion;
private String carrier;
private String deviceType;
private String eventId;
private String ip;
private Double latitude;
private Double longitude;
private String netType;
private String osName;
private String osVersion;
private String releaseChannel;
private String resolution;
private String sessionId;
private Long timestamp;
private String newSessionId;
private String country;
private String province;
private String city;
private String region;
private HashMap<String, Object> properties;
private Long lastUpdate;
private int isNew; //whether the raw data already carries a new-user flag (usually it does not)
//whether this is a new user: 1 = new user, 0 = returning user
private int isN;
public DataBean(){}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public String getDeviceId() {
return deviceId;
}
public void setDeviceId(String deviceId) {
this.deviceId = deviceId;
}
public String getGuid() {
return guid;
}
public void setGuid(String guid) {
this.guid = guid;
}
public String getAccount() {
return account;
}
public void setAccount(String account) {
this.account = account;
}
public String getAppId() {
return appId;
}
public void setAppId(String appId) {
this.appId = appId;
}
public String getAppVersion() {
return appVersion;
}
public void setAppVersion(String appVersion) {
this.appVersion = appVersion;
}
public String getCarrier() {
return carrier;
}
public void setCarrier(String carrier) {
this.carrier = carrier;
}
public String getDeviceType() {
return deviceType;
}
public void setDeviceType(String deviceType) {
this.deviceType = deviceType;
}
public String getEventId() {
return eventId;
}
public void setEventId(String eventId) {
this.eventId = eventId;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public Double getLatitude() {
return latitude;
}
public void setLatitude(Double latitude) {
this.latitude = latitude;
}
public Double getLongitude() {
return longitude;
}
public void setLongitude(Double longitude) {
this.longitude = longitude;
}
public String getNetType() {
return netType;
}
public void setNetType(String netType) {
this.netType = netType;
}
public String getOsName() {
return osName;
}
public void setOsName(String osName) {
this.osName = osName;
}
public String getOsVersion() {
return osVersion;
}
public void setOsVersion(String osVersion) {
this.osVersion = osVersion;
}
public String getReleaseChannel() {
return releaseChannel;
}
public void setReleaseChannel(String releaseChannel) {
this.releaseChannel = releaseChannel;
}
public String getResolution() {
return resolution;
}
public void setResolution(String resolution) {
this.resolution = resolution;
}
public String getSessionId() {
return sessionId;
}
public void setSessionId(String sessionId) {
this.sessionId = sessionId;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public String getNewSessionId() {
return newSessionId;
}
public void setNewSessionId(String newSessionId) {
this.newSessionId = newSessionId;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
public String getRegion() {
return region;
}
public void setRegion(String region) {
this.region = region;
}
public HashMap<String, Object> getProperties() {
return properties;
}
public void setProperties(HashMap<String, Object> properties) {
this.properties = properties;
}
public Long getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(Long lastUpdate) {
this.lastUpdate = lastUpdate;
}
public int getIsNew() {
return isNew;
}
public void setIsNew(int isNew) {
this.isNew = isNew;
}
public int getIsN() {
return isN;
}
public void setIsN(int isN) {
this.isN = isN;
}
@Override
public String toString() {
return "DataBean{" +
"id=" + id +
", deviceId='" + deviceId + '\'' +
", guid='" + guid + '\'' +
", account='" + account + '\'' +
", appId='" + appId + '\'' +
", appVersion='" + appVersion + '\'' +
", carrier='" + carrier + '\'' +
", deviceType='" + deviceType + '\'' +
", eventId='" + eventId + '\'' +
", ip='" + ip + '\'' +
", latitude=" + latitude +
", longitude=" + longitude +
", netType='" + netType + '\'' +
", osName='" + osName + '\'' +
", osVersion='" + osVersion + '\'' +
", releaseChannel='" + releaseChannel + '\'' +
", resolution='" + resolution + '\'' +
", sessionId='" + sessionId + '\'' +
", timestamp=" + timestamp +
", newSessionId='" + newSessionId + '\'' +
", country='" + country + '\'' +
", province='" + province + '\'' +
", city='" + city + '\'' +
", region='" + region + '\'' +
", properties=" + properties +
", lastUpdate=" + lastUpdate +
", isNew=" + isNew +
", isN=" + isN +
'}';
}
}
3. JsonToBeanFunc
package cn._51doit.udf;
import cn._51doit.pojo.DataBean;
import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
/**
* Converts a JSON string into a DataBean
*/
public class JsonToBeanFunc extends ProcessFunction<String, DataBean> {
@Override
public void processElement(String value, Context ctx, Collector<DataBean> out) throws Exception {
try {
DataBean dataBean = JSON.parseObject(value, DataBean.class);
out.collect(dataBean);
} catch (Exception e) {
//e.printStackTrace();
//TODO save the problematic records somewhere instead of silently dropping them
}
}
}
4. EventID
package cn._51doit.constant;
public class EventID {
public static final String APP_LAUNCH = "appLaunch";
}
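For reference, a hypothetical input record that JsonToBeanFunc would map onto a DataBean (the field names follow the DataBean properties; the values are made up):
{"deviceId":"d-0001","eventId":"appLaunch","isNew":1,"osName":"android","osVersion":"10","releaseChannel":"huawei","longitude":116.31,"latitude":39.98,"timestamp":1615910400000,"properties":{"page":"home"}}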
V. Counting New/Returning Users by Province
The province is looked up from the longitude and latitude.
1. LocationFunction
package cn._51doit.udf;
import cn._51doit.pojo.DataBean;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.AsyncFunction;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.util.EntityUtils;
import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.function.Supplier;
public class LocationFunction extends RichAsyncFunction<DataBean, DataBean> {
private transient CloseableHttpAsyncClient httpclient; //async HTTP client
private String url; //AMap (Gaode Map) request URL
private String key; //AMap API key, obtained after registering as an AMap developer
private int maxConnTotal; //maximum number of connections for the async HTTP client
public LocationFunction(String url, String key, int maxConnTotal) {
this.url = url;
this.key = key;
this.maxConnTotal = maxConnTotal;
}
@Override
public void open(Configuration parameters) throws Exception {
RequestConfig requestConfig = RequestConfig.custom().build();
httpclient = HttpAsyncClients.custom() //build the async HttpAsyncClient connection pool
.setMaxConnTotal(maxConnTotal) //set the maximum number of connections
.setDefaultRequestConfig(requestConfig).build();
httpclient.start(); //start the async httpClient
}
@Override
public void asyncInvoke(DataBean bean, ResultFuture<DataBean> resultFuture) throws Exception {
double longitude = bean.getLongitude(); //get the longitude
double latitude = bean.getLatitude(); //get the latitude
//append the coordinates and the AMap key to the request URL
HttpGet httpGet = new HttpGet(url + "?location=" + longitude + "," + latitude + "&key=" + key);
//send the request asynchronously and get a Future back
Future<HttpResponse> future = httpclient.execute(httpGet, null);
CompletableFuture.supplyAsync(new Supplier<DataBean>() {
@Override
public DataBean get() {
try {
HttpResponse response = future.get();
String province = null;
String city = null;
if (response.getStatusLine().getStatusCode() == 200) {
//parse the response and extract the province, city, etc.
String result = EntityUtils.toString(response.getEntity());
JSONObject jsonObj = JSON.parseObject(result);
JSONObject regeocode = jsonObj.getJSONObject("regeocode");
if (regeocode != null && !regeocode.isEmpty()) {
JSONObject address = regeocode.getJSONObject("addressComponent");
province = address.getString("province");
city = address.getString("city");
}
}
bean.setProvince(province); //set the province from the response
bean.setCity(city); //set the city from the response
return bean;
} catch (Exception e) {
return null;
}
}
}).thenAccept((DataBean result) -> {
//emit the result via resultFuture (complete only accepts a collection; a single element is wrapped in a singleton collection)
resultFuture.complete(Collections.singleton(result));
});
}
@Override
public void close() throws Exception {
httpclient.close();
}
}
2. UserCount2
package cn._51doit.jobs;
import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.udf.LocationFunction;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import java.util.concurrent.TimeUnit;
/**
* Counts of new, returning and active users for the current day, with added dimensions (region, OS, event, brand)
*/
public class UserCount2 {
public static void main(String[] args) throws Exception {
DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);
//parse the data
SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());
//beanStream.map()
SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));
String url = FlinkUtils.parameterTool.getRequired("amap.http.url");
String key = FlinkUtils.parameterTool.getRequired("amap.key");
SingleOutputStreamOperator<DataBean> dataBeanWithLocation = AsyncDataStream.unorderedWait(
filtered, new LocationFunction(url, key, 50),
5,
TimeUnit.SECONDS
);
SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> locationUserCount = dataBeanWithLocation.map(new MapFunction<DataBean, Tuple3<String, Integer, Integer>>() {
@Override
public Tuple3<String, Integer, Integer> map(DataBean value) throws Exception {
return Tuple3.of(value.getProvince(), value.getIsNew(), 1);
}
}).keyBy(new KeySelector<Tuple3<String, Integer, Integer>, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> getKey(Tuple3<String, Integer, Integer> value) throws Exception {
return Tuple2.of(value.f0, value.f1);
}
}).sum(2);
locationUserCount.print();
FlinkUtils.env.execute();
}
}
Note: with async I/O you can first look the address up in a database; only if it is not there query the AMap API, and then write the resolved address back to the database, as sketched below.
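A minimal sketch of that caching pattern, assuming three hypothetical helpers (queryLocationFromDb / saveLocationToDb backed by e.g. Redis or MySQL, and queryLocationFromAmap doing the same HTTP call as LocationFunction above); these helpers are not part of the original project:
import cn._51doit.pojo.DataBean;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import java.util.Collections;
import java.util.concurrent.CompletableFuture;

// Sketch only: cache reverse-geocoding results in an external store so the AMap API is queried at most once per coordinate pair.
public class CachedLocationFunction extends RichAsyncFunction<DataBean, DataBean> {

    @Override
    public void asyncInvoke(DataBean bean, ResultFuture<DataBean> resultFuture) {
        CompletableFuture.supplyAsync(() -> {
            // 1. try the external store (Redis, MySQL, ...) first
            String[] loc = queryLocationFromDb(bean.getLongitude(), bean.getLatitude());
            if (loc == null) {
                // 2. fall back to the AMap API (same HTTP call as LocationFunction)
                loc = queryLocationFromAmap(bean.getLongitude(), bean.getLatitude());
                if (loc != null) {
                    // 3. write the resolved address back so later lookups hit the store
                    saveLocationToDb(bean.getLongitude(), bean.getLatitude(), loc);
                }
            }
            if (loc != null) {
                bean.setProvince(loc[0]);
                bean.setCity(loc[1]);
            }
            return bean;
        }).thenAccept(result -> resultFuture.complete(Collections.singleton(result)));
    }

    // Hypothetical helper: look the coordinates up in the external store; null if not cached yet.
    private String[] queryLocationFromDb(Double longitude, Double latitude) { return null; }

    // Hypothetical helper: call the AMap regeo API and return {province, city}.
    private String[] queryLocationFromAmap(Double longitude, Double latitude) { return null; }

    // Hypothetical helper: persist {province, city} for these coordinates.
    private void saveLocationToDb(Double longitude, Double latitude, String[] loc) { }
}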
VI. User Deduplication: Approach and Examples
Option 1: traditional deduplication (keep the full set of user IDs in keyed state)
package cn._51doit.test;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.StateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashSet;
/**
*
* u1,A1,view
* u1,A1,view
* u1,A1,view
* u1,A1,join
* u1,A1,join
* u2,A1,view
* u2,A1,join
* u1,A2,view
* u1,A2,view
* u1,A2,join
*
* view count:       A1,view,4
* distinct viewers: A1,view,2
* join count:       A1,join,3
* distinct joiners: A1,join,2
*
*/
public class ActivityCount {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//u1,A1,view
DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);
SingleOutputStreamOperator<Tuple3<String, String, String>> tpStream = lines.map(new MapFunction<String, Tuple3<String, String, String>>() {
@Override
public Tuple3<String, String, String> map(String value) throws Exception {
String[] fields = value.split(",");
return Tuple3.of(fields[0], fields[1], fields[2]);
}
});
//keyBy on activity ID and event ID: records for the same activity and the same event always go to the same partition
KeyedStream<Tuple3<String, String, String>, Tuple2<String, String>> keyedStream = tpStream.keyBy(new KeySelector<Tuple3<String, String, String>, Tuple2<String, String>>() {
@Override
public Tuple2<String, String> getKey(Tuple3<String, String, String> value) throws Exception {
return Tuple2.of(value.f1, value.f2);
}
});
//aggregate within each group
keyedStream.process(new KeyedProcessFunction<Tuple2<String, String>, Tuple3<String, String, String>, Tuple4<String, String, Integer, Integer>>() {
private transient ValueState<HashSet<String>> uidState;
private transient ValueState<Integer> countState;
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<HashSet<String>> stateDescriptor = new ValueStateDescriptor<>("uids-state", TypeInformation.of(new TypeHint<HashSet<String>>(){}));
uidState = getRuntimeContext().getState(stateDescriptor);
ValueStateDescriptor<Integer> countStateDescriptor = new ValueStateDescriptor<>("uid-count-state", Integer.class);
countState = getRuntimeContext().getState(countStateDescriptor);
}
@Override
public void processElement(Tuple3<String, String, String> value, Context ctx, Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
String uid = value.f0;
HashSet<String> set = uidState.value();
if(set == null) {
set = new HashSet<>();
}
set.add(uid);
//update the state
uidState.update(set);
Integer count = countState.value();
if(count == null) {
count = 0;
}
count++;
//update the state
countState.update(count);
//emit the result
out.collect(Tuple4.of(value.f1, value.f2, set.size(), count));
}
}).print();
env.execute();
}
}
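To test this locally, first open a socket and paste in the sample lines from the comment above, then start the job; the same applies to ActivityCount2 below:
nc -lk 8888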
Option 2: deduplication with a Bloom filter
A Bloom filter is a compact probabilistic set: mightContain can return false positives but never false negatives, so distinct counts may be very slightly low while using far less memory than keeping every user ID in a HashSet.
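As a minimal standalone illustration of the shaded Guava BloomFilter API that the jobs below rely on (the expected-insertion count and false-positive probability here are example values):
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;

public class BloomFilterDemo {
    public static void main(String[] args) {
        // expect up to 100000 distinct users with roughly a 1% false-positive rate
        BloomFilter<String> filter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000, 0.01);
        filter.put("u1");
        System.out.println(filter.mightContain("u1")); // true
        System.out.println(filter.mightContain("u2")); // almost certainly false (small chance of a false positive)
    }
}
ActivityCount2 below applies the same put/mightContain calls, but keeps the filter in keyed state so it is checkpointed along with the counters.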
package cn._51doit.test;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashSet;
/**
* u1,A1,view
* u1,A1,view
* u1,A1,view
* u1,A1,join
* u1,A1,join
* u2,A1,view
* u2,A1,join
* u1,A2,view
* u1,A2,view
* u1,A2,join
*
* view count:       A1,view,4
* distinct viewers: A1,view,2
* join count:       A1,join,3
* distinct joiners: A1,join,2
*
* Deduplication using a Bloom filter
*/
public class ActivityCount2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//u1,A1,view
DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);
SingleOutputStreamOperator<Tuple3<String, String, String>> tpStream = lines.map(new MapFunction<String, Tuple3<String, String, String>>() {
@Override
public Tuple3<String, String, String> map(String value) throws Exception {
String[] fields = value.split(",");
return Tuple3.of(fields[0], fields[1], fields[2]);
}
});
//keyBy on activity ID and event ID: records for the same activity and the same event always go to the same partition
KeyedStream<Tuple3<String, String, String>, Tuple2<String, String>> keyedStream = tpStream.keyBy(new KeySelector<Tuple3<String, String, String>, Tuple2<String, String>>() {
@Override
public Tuple2<String, String> getKey(Tuple3<String, String, String> value) throws Exception {
return Tuple2.of(value.f1, value.f2);
}
});
//aggregate within each group
keyedStream.process(new KeyedProcessFunction<Tuple2<String, String>, Tuple3<String, String, String>, Tuple4<String, String, Integer, Integer>>() {
private transient ValueState<BloomFilter<String>> bloomFilterState;
private transient ValueState<Integer> countState;
private transient ValueState<Integer> uidCountState;
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<BloomFilter<String>> bloomFilterDescriptor = new ValueStateDescriptor<>("uids-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {
}));
bloomFilterState = getRuntimeContext().getState(bloomFilterDescriptor);
ValueStateDescriptor<Integer> uidCountStateDescriptor = new ValueStateDescriptor<>("uid-count-state", Integer.class);
uidCountState = getRuntimeContext().getState(uidCountStateDescriptor);
ValueStateDescriptor<Integer> countStateDescriptor = new ValueStateDescriptor<>("count-state", Integer.class);
countState = getRuntimeContext().getState(countStateDescriptor);
}
@Override
public void processElement(Tuple3<String, String, String> value, Context ctx, Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
String uid = value.f0;
BloomFilter<String> bloomFilter = bloomFilterState.value();
Integer uidCount = uidCountState.value();
if (bloomFilter == null) {
bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
uidCount = 0;
}
//if the user is definitely not in the Bloom filter yet, add it and count one more distinct user
if (!bloomFilter.mightContain(uid)) {
bloomFilter.put(uid);
uidCount++;
}
//update the state
bloomFilterState.update(bloomFilter);
uidCountState.update(uidCount);
Integer count = countState.value();
if (count == null) {
count = 0;
}
count++;
//update the state
countState.update(count);
//emit the result
out.collect(Tuple4.of(value.f1, value.f2, uidCount, count));
}
}).print();
env.execute();
}
}
VII. Computing New/Returning Users with a Bloom Filter
If the records carry no isNew flag, route the same user to the same partition and the same key group, and use state there to record whether the user is new or returning.
package cn._51doit.jobs;
import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.IsNewUserFunction;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.udf.LocationFunction;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.concurrent.TimeUnit;
/**
* Counts of new, returning and active users for the current day, with added dimensions (region, OS, event, brand)
*/
public class UserCount3 {
public static void main(String[] args) throws Exception {
DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);
//parse the data
SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());
//beanStream.map()
SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));
//first determine whether the current device ID belongs to a new user
//keyBy on the device type (phone model)
KeyedStream<DataBean, String> keyed = filtered.keyBy(DataBean::getDeviceType);
keyed.process(new IsNewUserFunction()).print();
FlinkUtils.env.execute();
}
}
2. IsNewUserFunction
package cn._51doit.udf;
import cn._51doit.pojo.DataBean;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
/**
* Determines whether the user is new based on the device ID (UDID)
*
* The stream is keyed by device type first
*/
public class IsNewUserFunction extends KeyedProcessFunction<String, DataBean, DataBean> {
private transient ValueState<BloomFilter<String>> bloomFilterState;
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<BloomFilter<String>> valueStateDescriptor = new ValueStateDescriptor<BloomFilter<String>>("uid-bloom-filter-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {}));
bloomFilterState = getRuntimeContext().getState(valueStateDescriptor);
}
@Override
public void processElement(DataBean bean, Context ctx, Collector<DataBean> out) throws Exception {
String deviceId = bean.getDeviceId();
BloomFilter<String> bloomFilter = bloomFilterState.value();
if (bloomFilter == null) {
bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
}
//check whether the device ID is already in the Bloom filter
if(!bloomFilter.mightContain(deviceId)) {
bloomFilter.put(deviceId);
bean.setIsN(1); //mark as a new user
bloomFilterState.update(bloomFilter);
}
out.collect(bean);
}
}
VIII. Optimizing the New-User Computation
With the approach above, a popular phone model can carry a disproportionate share of the data and cause data skew. Instead, key the stream by device ID and use one Bloom filter per partition (subtask), holding the device IDs seen by that subtask.
package cn._51doit.jobs;
import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.IsNewUserFunc2;
import cn._51doit.udf.IsNewUserFunction;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
/**
* Optimization: keyBy on deviceId and use operator state; a partition contains many key groups, but each partition uses a single Bloom filter
*/
public class UserCount4 {
public static void main(String[] args) throws Exception {
DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);
//parse the data
SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());
//beanStream.map()
SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));
//keyBy on the device ID
KeyedStream<DataBean, String> keyed = filtered.keyBy(DataBean::getDeviceId);
SingleOutputStreamOperator<DataBean> res = keyed.map(new IsNewUserFunc2());
res.print();
FlinkUtils.env.execute();
}
}
2. IsNewUserFunc2
package cn._51doit.udf;
import cn._51doit.pojo.DataBean;
import java.util.Collections;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
/**
* Uses operator state
*/
public class IsNewUserFunc2 extends RichMapFunction<DataBean, DataBean> implements CheckpointedFunction {
private transient ListState<BloomFilter<String>> listState ;
private transient BloomFilter<String> bloomFilter;
@Override
public void initializeState(FunctionInitializationContext context) throws Exception {
//called only once, when the function is initialized or restored
ListStateDescriptor<BloomFilter<String>> stateDescriptor = new ListStateDescriptor<>("uid-bloom-filter-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {
}));
listState = context.getOperatorStateStore().getListState(stateDescriptor);
if(context.isRestored()){
Iterable<BloomFilter<String>> iterable = listState.get();
for (BloomFilter<String> bloomFilter : iterable) {
this.bloomFilter=bloomFilter;
}
}
}
@Override
public DataBean map(DataBean bean) throws Exception {
//called once per record
String deviceId = bean.getDeviceId();
if (bloomFilter == null) {
bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
}
if (!bloomFilter.mightContain(deviceId)) {
bloomFilter.put(deviceId);
bean.setIsN(1); //mark as a new user
}
return bean;
}
@Override
public void snapshotState(FunctionSnapshotContext context) throws Exception {
//called periodically, on every checkpoint
listState.update(Collections.singletonList(bloomFilter));
}
}
3. Using RocksDB as the StateBackend
Advantage: incremental checkpoints
Option 1:
Add the dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_2.11</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
Change the code:
env.setStateBackend(new RocksDBStateBackend(checkpointPath, true));
Option 2:
Configure it globally in flink-conf.yaml:
#there must be a space after the colon of each option, otherwise parsing fails
state.backend: rocksdb
state.checkpoints.dir: hdfs://namenode-host:port/flink-checkpoints
state.backend.incremental: true