Big Data: Building a Real-Time Data Warehouse (Part 1)

I. Data Collection Service

There are two ways to get the log data from nginx into Kafka:
One is to have nginx write to Kafka directly, which requires the nginx-kafka plugin; see the guide on installing and using nginx-kafka.
The other is to collect the data with OpenResty, land it in log files, and ship it to Kafka with Flume; see the guide on installing and using OpenResty.

2. Writing nginx-kafka.conf

Install Flume on the machine where nginx runs, and use a TAILDIR source together with a KafkaChannel to deliver the data into Kafka; no sink is needed.

a1.sources = r1
a1.channels = c1

a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /root/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /log/access-.*\.log

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092
a1.channels.c1.kafka.topic = access19
a1.channels.c1.parseAsFlumeEvent = false

a1.sources.r1.channels = c1

Start Flume:

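A typical launch command looks like the following (the Flume home directory, the config file path, and the console logger option are assumptions; the agent name must match a1 in the config above):

bin/flume-ng agent -n a1 -c conf -f /path/to/nginx-kafka.conf -Dflume.root.logger=INFO,console
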
II. A Generic Flink Utility Class

package cn._51doit.utils;

import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class FlinkUtils {

    public static final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    public static ParameterTool parameterTool;

    public static <T> DataStream<T> createKafkaStream(String[] args, Class<? extends DeserializationSchema<T>> deserializer) throws Exception {

        parameterTool = ParameterTool.fromPropertiesFile(args[0]);

        String checkpointPath = parameterTool.getRequired("checkpoint.path");
        long checkpointInterval = parameterTool.getLong("checkpoint.interval", 30000);

        //Enable checkpointing
        env.setStateBackend(new FsStateBackend(checkpointPath));
        env.enableCheckpointing(checkpointInterval, CheckpointingMode.EXACTLY_ONCE);

        //Retain checkpoint data when the job is cancelled or fails, so the job can later be restored from a chosen checkpoint
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        //Read data from Kafka
        Properties properties = new Properties(); //Kafka broker addresses (host:port)
        properties.setProperty("bootstrap.servers", parameterTool.getRequired("bootstrap.servers"));
        //Offset reset strategy: if the group has no committed offset, read from the beginning; otherwise continue from the committed offset
        properties.setProperty("auto.offset.reset", parameterTool.getRequired("auto.offset.reset"));
        //Consumer group ID
        properties.setProperty("group.id", parameterTool.getRequired("group.id"));
        //If checkpointing is not enabled, let the Kafka consumer auto-commit offsets periodically
        properties.setProperty("enable.auto.commit", parameterTool.getRequired("enable.auto.commit"));
        //Create the FlinkKafkaConsumer with the topics, deserializer and Kafka properties
        String topics = parameterTool.getRequired("kafka.topics");
        List<String> topicList = Arrays.asList(topics.split(","));

        FlinkKafkaConsumer<T> kafkaConsumer = new FlinkKafkaConsumer<>(
                topicList, //topics to read from
                deserializer.newInstance(), //deserialization schema for the Kafka records
                properties //Kafka consumer properties
        );

        //On checkpoint, do not also write the offsets back to Kafka's special offsets topic
        kafkaConsumer.setCommitOffsetsOnCheckpoints(false);

        return env.addSource(kafkaConsumer);

    }
}

2. Calling the Utility Class

package cn._51doit.jobs;

import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;

public class PreEtl {

    public static void main(String[] args) throws Exception {

        DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);

        //Transformation
        //sink
        kafkaStream.print();

        FlinkUtils.env.execute();

    }
}

3. conf.properties (passed to the job as the first program argument, args[0])

checkpoint.interval=30000
checkpoint.path=hdfs://node-1.51doit.cn:9000/chk2021
bootstrap.servers=node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092
group.id=g10
enable.auto.commit=false
auto.offset.reset=earliest
kafka.topics=wordcount
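
The jobs in later sections also read the Amap settings from this same file, so two more entries are needed. The URL shown is the commonly used Amap reverse-geocoding endpoint, and the key is a placeholder you obtain by registering as an Amap developer:

amap.http.url=https://restapi.amap.com/v3/geocode/regeo
amap.key=<your-amap-key>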

III. Project Requirements

Basic overview of the raw data

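For reference, each raw event is a JSON object whose field names match the DataBean class shown below; an illustrative appLaunch record (all values made up purely for illustration) might look like this:

{"deviceId":"d8f3c2a1","account":"u_1001","appId":"com.example.app","appVersion":"2.3.1","carrier":"CMCC","deviceType":"MI-8","eventId":"appLaunch","ip":"36.23.11.108","latitude":30.2741,"longitude":120.1551,"netType":"WIFI","osName":"android","osVersion":"10","releaseChannel":"xiaomi","resolution":"1080x2244","sessionId":"s-0001","timestamp":1614567890000,"isNew":1,"properties":{}}
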
IV. Counts by OS and by New vs. Returning Users

package cn._51doit.jobs;

import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

/**
 * Count the number of new users, returning users, and active users for the current day, broken down by dimensions (region, OS, brand, etc.)
 */
public class UserCount {

    public static void main(String[] args) throws Exception {

        DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);

        //Parse the JSON records into DataBean objects
        SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());

        SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));

        SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> osNameIsNewAndOne = filtered
                .map(bean -> Tuple3.of(bean.getOsName(), bean.getIsNew(), 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT, Types.INT));

        //Key by osName and isNew
        SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> res1 = osNameIsNewAndOne.keyBy(tp -> Tuple2.of(tp.f0, tp.f1), TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {
        })).sum(2);

        res1.print();


        res1.map(new MapFunction<Tuple3<String, Integer, Integer>, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(Tuple3<String, Integer, Integer> value) throws Exception {
                return Tuple2.of(value.f1, value.f2);
            }
        }).keyBy(t -> t.f0).sum(1).print("total");

        //beanStream.print();

        FlinkUtils.env.execute();

    }
}

2. DataBean

package cn._51doit.pojo;

import java.util.HashMap;

public class DataBean {

    private Integer id;
    //Unique Device Identifier (UDID)
    //private String udid;
    private String deviceId;

    private String guid;

    private String account;

    private String appId;

    private String appVersion;

    private String carrier;

    private String deviceType;

    private String eventId;

    private String ip;

    private Double latitude;

    private Double longitude;

    private String netType;

    private String osName;

    private String osVersion;

    private String releaseChannel;

    private String resolution;

    private String sessionId;

    private Long timestamp;

    private String newSessionId;

    private String country;

    private String province;

    private String city;

    private String region;

    private HashMap<String, Object> properties;

    private Long lastUpdate;

    private int isNew; //new-user flag carried in the raw data (usually absent)

    //whether this is a new user: 1 = new user, 0 = returning user
    private int isN;

    public DataBean(){}


    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getDeviceId() {
        return deviceId;
    }

    public void setDeviceId(String deviceId) {
        this.deviceId = deviceId;
    }

    public String getGuid() {
        return guid;
    }

    public void setGuid(String guid) {
        this.guid = guid;
    }

    public String getAccount() {
        return account;
    }

    public void setAccount(String account) {
        this.account = account;
    }

    public String getAppId() {
        return appId;
    }

    public void setAppId(String appId) {
        this.appId = appId;
    }

    public String getAppVersion() {
        return appVersion;
    }

    public void setAppVersion(String appVersion) {
        this.appVersion = appVersion;
    }

    public String getCarrier() {
        return carrier;
    }

    public void setCarrier(String carrier) {
        this.carrier = carrier;
    }

    public String getDeviceType() {
        return deviceType;
    }

    public void setDeviceType(String deviceType) {
        this.deviceType = deviceType;
    }

    public String getEventId() {
        return eventId;
    }

    public void setEventId(String eventId) {
        this.eventId = eventId;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public Double getLatitude() {
        return latitude;
    }

    public void setLatitude(Double latitude) {
        this.latitude = latitude;
    }

    public Double getLongitude() {
        return longitude;
    }

    public void setLongitude(Double longitude) {
        this.longitude = longitude;
    }

    public String getNetType() {
        return netType;
    }

    public void setNetType(String netType) {
        this.netType = netType;
    }

    public String getOsName() {
        return osName;
    }

    public void setOsName(String osName) {
        this.osName = osName;
    }

    public String getOsVersion() {
        return osVersion;
    }

    public void setOsVersion(String osVersion) {
        this.osVersion = osVersion;
    }

    public String getReleaseChannel() {
        return releaseChannel;
    }

    public void setReleaseChannel(String releaseChannel) {
        this.releaseChannel = releaseChannel;
    }

    public String getResolution() {
        return resolution;
    }

    public void setResolution(String resolution) {
        this.resolution = resolution;
    }

    public String getSessionId() {
        return sessionId;
    }

    public void setSessionId(String sessionId) {
        this.sessionId = sessionId;
    }

    public Long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    public String getNewSessionId() {
        return newSessionId;
    }

    public void setNewSessionId(String newSessionId) {
        this.newSessionId = newSessionId;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getRegion() {
        return region;
    }

    public void setRegion(String region) {
        this.region = region;
    }

    public HashMap<String, Object> getProperties() {
        return properties;
    }

    public void setProperties(HashMap<String, Object> properties) {
        this.properties = properties;
    }

    public Long getLastUpdate() {
        return lastUpdate;
    }

    public void setLastUpdate(Long lastUpdate) {
        this.lastUpdate = lastUpdate;
    }

    public int getIsNew() {
        return isNew;
    }

    public void setIsNew(int isNew) {
        this.isNew = isNew;
    }

    public int getIsN() {
        return isN;
    }

    public void setIsN(int isN) {
        this.isN = isN;
    }

    @Override
    public String toString() {
        return "DataBean{" +
                "id=" + id +
                ", deviceId='" + deviceId + '\'' +
                ", guid='" + guid + '\'' +
                ", account='" + account + '\'' +
                ", appId='" + appId + '\'' +
                ", appVersion='" + appVersion + '\'' +
                ", carrier='" + carrier + '\'' +
                ", deviceType='" + deviceType + '\'' +
                ", eventId='" + eventId + '\'' +
                ", ip='" + ip + '\'' +
                ", latitude=" + latitude +
                ", longitude=" + longitude +
                ", netType='" + netType + '\'' +
                ", osName='" + osName + '\'' +
                ", osVersion='" + osVersion + '\'' +
                ", releaseChannel='" + releaseChannel + '\'' +
                ", resolution='" + resolution + '\'' +
                ", sessionId='" + sessionId + '\'' +
                ", timestamp=" + timestamp +
                ", newSessionId='" + newSessionId + '\'' +
                ", country='" + country + '\'' +
                ", province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", region='" + region + '\'' +
                ", properties=" + properties +
                ", lastUpdate=" + lastUpdate +
                ", isNew=" + isNew +
                ", isN=" + isN +
                '}';
    }
}

3. JsonToBeanFunc

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;
import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

/**
 * Converts a JSON string into a DataBean
 */
public class JsonToBeanFunc extends ProcessFunction<String, DataBean> {

    @Override
    public void processElement(String value, Context ctx, Collector<DataBean> out) throws Exception {

        try {
            DataBean dataBean = JSON.parseObject(value, DataBean.class);
            out.collect(dataBean);
        } catch (Exception e) {
            //e.printStackTrace();
            //TODO: keep the malformed records instead of dropping them (see the side-output sketch after this class)
        }

    }
}
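
One way to act on the TODO above is a Flink side output: records that fail to parse are routed to an OutputTag and can then be sunk separately (to Kafka, HDFS, etc.). A minimal sketch, in which the class name and tag name are assumptions rather than part of the original project:

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;
import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

/**
 * Variant of JsonToBeanFunc that keeps malformed records in a side output instead of dropping them
 */
public class JsonToBeanWithDirtyFunc extends ProcessFunction<String, DataBean> {

    //Records that fail to parse are routed here; the anonymous subclass preserves the generic type
    public static final OutputTag<String> DIRTY_TAG = new OutputTag<String>("dirty-json") {};

    @Override
    public void processElement(String value, Context ctx, Collector<DataBean> out) throws Exception {
        try {
            out.collect(JSON.parseObject(value, DataBean.class));
        } catch (Exception e) {
            ctx.output(DIRTY_TAG, value); //keep the bad record for later inspection
        }
    }
}

In the job, beanStream.getSideOutput(JsonToBeanWithDirtyFunc.DIRTY_TAG) then yields a DataStream<String> of the unparsable records.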

4. EventID

package cn._51doit.constant;

public class EventID {

    public static final String APP_LAUNCH = "appLaunch";
}

V. Counting New and Returning Users by Province

The province is looked up from the longitude/latitude via the Amap (Gaode Maps) reverse-geocoding API.

1. LocationFunction

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.AsyncFunction;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.util.EntityUtils;

import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.function.Supplier;

public class LocationFunction extends RichAsyncFunction<DataBean, DataBean> {

    private transient CloseableHttpAsyncClient httpclient; //async HTTP client
    private String url; //Amap API URL
    private String key; //Amap API key, obtained after registering as an Amap developer
    private int maxConnTotal; //maximum number of connections for the async HTTP client

    public LocationFunction(String url, String key, int maxConnTotal) {
        this.url = url;
        this.key = key;
        this.maxConnTotal = maxConnTotal;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        RequestConfig requestConfig = RequestConfig.custom().build();
        httpclient = HttpAsyncClients.custom() //build the async HTTP client (connection pool)
                .setMaxConnTotal(maxConnTotal) //maximum number of connections
                .setDefaultRequestConfig(requestConfig).build();
        httpclient.start(); //start the async HTTP client
    }

    @Override
    public void asyncInvoke(DataBean bean, ResultFuture<DataBean> resultFuture) throws Exception {
        double longitude = bean.getLongitude(); //longitude
        double latitude = bean.getLatitude(); //latitude
        //Build the request URL from the coordinates and the Amap key
        HttpGet httpGet = new HttpGet(url + "?location=" + longitude + "," + latitude + "&key=" + key);
        //Send the asynchronous request; it returns a Future
        Future<HttpResponse> future = httpclient.execute(httpGet, null);
        CompletableFuture.supplyAsync(new Supplier<DataBean>() {
            @Override
            public DataBean get() {
                try {
                    HttpResponse response = future.get();
                    String province = null;
                    String city = null;
                    if (response.getStatusLine().getStatusCode() == 200) {
                        //Parse the response and extract the province, city, etc.
                        String result = EntityUtils.toString(response.getEntity());
                        JSONObject jsonObj = JSON.parseObject(result);
                        JSONObject regeocode = jsonObj.getJSONObject("regeocode");
                        if (regeocode != null && !regeocode.isEmpty()) {
                            JSONObject address = regeocode.getJSONObject("addressComponent");
                            province = address.getString("province");
                            city = address.getString("city");
                        }
                    }
                    bean.setProvince(province); //set the province from the response
                    bean.setCity(city); //set the city from the response
                    return bean;
                } catch (Exception e) {
                    return null;
                }
            }
        }).thenAccept((DataBean result) -> {
            //Emit the result through resultFuture (complete takes a collection, so a single element is wrapped in a singleton collection)
            resultFuture.complete(Collections.singleton(result));
        });

    }

    @Override
    public void close() throws Exception {
        httpclient.close();
    }
}

2. UserCount2

package cn._51doit.jobs;

import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.udf.LocationFunction;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

import java.util.concurrent.TimeUnit;

/**
 * Count the number of new users, returning users, and active users for the current day, broken down by dimensions (region, OS, brand, etc.)
 */
public class UserCount2 {

    public static void main(String[] args) throws Exception {

        DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);

        //Parse the data
        SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());

        //beanStream.map()
        SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));

        String url = FlinkUtils.parameterTool.getRequired("amap.http.url");
        String key = FlinkUtils.parameterTool.getRequired("amap.key");

        SingleOutputStreamOperator<DataBean> dataBeanWithLocation = AsyncDataStream.unorderedWait(
                filtered, new LocationFunction(url, key, 50),
                5,
                TimeUnit.SECONDS
        );

        SingleOutputStreamOperator<Tuple3<String, Integer, Integer>> locationUserCount = dataBeanWithLocation.map(new MapFunction<DataBean, Tuple3<String, Integer, Integer>>() {
            @Override
            public Tuple3<String, Integer, Integer> map(DataBean value) throws Exception {
                return Tuple3.of(value.getProvince(), value.getIsNew(), 1);
            }
        }).keyBy(new KeySelector<Tuple3<String, Integer, Integer>, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> getKey(Tuple3<String, Integer, Integer> value) throws Exception {
                return Tuple2.of(value.f0, value.f1);
            }
        }).sum(2);

        locationUserCount.print();

        FlinkUtils.env.execute();

    }
}

Note: with asynchronous I/O, look the address up in a database first; only if it is missing there, query Amap, and then write the result back to enrich the database (see the sketch below).
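
The sketch below illustrates that lookup order, using an in-memory map as a stand-in for the address database and a rounded-coordinate cache key; the class name and these details are assumptions, not part of the original project:

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;

import java.util.Collections;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Sketch: enrich the location from a local store first and fall back to the Amap API only on a miss
 */
public class CachedLocationFunction extends RichAsyncFunction<DataBean, DataBean> {

    //Stand-in for the address table; a real job would read and update it with an async database client
    private transient Map<String, String[]> addressCache;

    @Override
    public void open(Configuration parameters) throws Exception {
        addressCache = new ConcurrentHashMap<>();
    }

    @Override
    public void asyncInvoke(DataBean bean, ResultFuture<DataBean> resultFuture) throws Exception {
        //Round the coordinates so nearby points share one cache entry (assumed granularity)
        String key = Math.round(bean.getLongitude() * 100) + "," + Math.round(bean.getLatitude() * 100);
        String[] cached = addressCache.get(key);
        if (cached != null) {
            //Hit: enrich from the local store, no HTTP request needed
            bean.setProvince(cached[0]);
            bean.setCity(cached[1]);
            resultFuture.complete(Collections.singleton(bean));
        } else {
            //Miss: call the Amap API exactly as LocationFunction.asyncInvoke does, and once the
            //response arrives, put the province/city into addressCache (and the database) before
            //completing, so later records with nearby coordinates skip the HTTP round trip
            resultFuture.complete(Collections.singleton(bean)); //placeholder: the HTTP path is omitted in this sketch
        }
    }
}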

VI. User Deduplication: Approach and Examples

Approach 1: conventional deduplication (keep the full set of user IDs in keyed state)

package cn._51doit.test;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.StateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.HashSet;

/**
 *
 * u1,A1,view
 * u1,A1,view
 * u1,A1,view
 * u1,A1,join
 * u1,A1,join
 * u2,A1,view
 * u2,A1,join
 * u1,A2,view
 * u1,A2,view
 * u1,A2,join
 *
 * views (count):        A1,view,4
 * viewers (distinct):   A1,view,2
 * joins (count):        A1,join,3
 * joiners (distinct):   A1,join,2
 *
 */

public class ActivityCount {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        //u1,A1,view
        DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);

        SingleOutputStreamOperator<Tuple3<String, String, String>> tpStream = lines.map(new MapFunction<String, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(String value) throws Exception {
                String[] fields = value.split(",");
                return Tuple3.of(fields[0], fields[1], fields[2]);
            }
        });

        //Key by activity ID and event ID, so all users of the same activity and event land in the same partition
        KeyedStream<Tuple3<String, String, String>, Tuple2<String, String>> keyedStream = tpStream.keyBy(new KeySelector<Tuple3<String, String, String>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> getKey(Tuple3<String, String, String> value) throws Exception {
                return Tuple2.of(value.f1, value.f2);
            }
        });

        //Aggregate within each key group
        keyedStream.process(new KeyedProcessFunction<Tuple2<String, String>, Tuple3<String, String, String>, Tuple4<String, String, Integer, Integer>>() {

            private transient ValueState<HashSet<String>> uidState;
            private transient ValueState<Integer> countState;

            @Override
            public void open(Configuration parameters) throws Exception {
                ValueStateDescriptor<HashSet<String>> stateDescriptor = new ValueStateDescriptor<>("uids-state", TypeInformation.of(new TypeHint<HashSet<String>>(){}));
                uidState = getRuntimeContext().getState(stateDescriptor);

                ValueStateDescriptor<Integer> countStateDescriptor = new ValueStateDescriptor<>("uid-count-state", Integer.class);
                countState = getRuntimeContext().getState(countStateDescriptor);

            }

            @Override
            public void processElement(Tuple3<String, String, String> value, Context ctx, Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
                String uid = value.f0;

                HashSet<String> set = uidState.value();
                if(set == null) {
                    set = new HashSet<>();
                }
                set.add(uid);
                //update the state
                uidState.update(set);

                Integer count = countState.value();
                if(count == null) {
                    count = 0;
                }
                count++;
                //update the state
                countState.update(count);

                //emit the result
                out.collect(Tuple4.of(value.f1, value.f2, set.size(), count));
            }
        }).print();

        env.execute();

    }
}

Approach 2: deduplication with a Bloom filter

A Bloom filter bounds the state size at the cost of a small false-positive rate: mightContain never returns false for an element that was added, but it may occasionally report a brand-new user as already seen, so the distinct count is approximate.

package cn._51doit.test;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.HashSet;

/**
 * u1,A1,view
 * u1,A1,view
 * u1,A1,view
 * u1,A1,join
 * u1,A1,join
 * u2,A1,view
 * u2,A1,join
 * u1,A2,view
 * u1,A2,view
 * u1,A2,join
 *
 * views (count):        A1,view,4
 * viewers (distinct):   A1,view,2
 * joins (count):        A1,join,3
 * joiners (distinct):   A1,join,2
 *
 * deduplicate with a Bloom filter
 */

public class ActivityCount2 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        //u1,A1,view
        DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);

        SingleOutputStreamOperator<Tuple3<String, String, String>> tpStream = lines.map(new MapFunction<String, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(String value) throws Exception {
                String[] fields = value.split(",");
                return Tuple3.of(fields[0], fields[1], fields[2]);
            }
        });

        //Key by activity ID and event ID, so all users of the same activity and event land in the same partition
        KeyedStream<Tuple3<String, String, String>, Tuple2<String, String>> keyedStream = tpStream.keyBy(new KeySelector<Tuple3<String, String, String>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> getKey(Tuple3<String, String, String> value) throws Exception {
                return Tuple2.of(value.f1, value.f2);
            }
        });

        //Aggregate within each key group
        keyedStream.process(new KeyedProcessFunction<Tuple2<String, String>, Tuple3<String, String, String>, Tuple4<String, String, Integer, Integer>>() {

            private transient ValueState<BloomFilter<String>> bloomFilterState;
            private transient ValueState<Integer> countState;
            private transient ValueState<Integer> uidCountState;

            @Override
            public void open(Configuration parameters) throws Exception {
                ValueStateDescriptor<BloomFilter<String>> bloomFilterDescriptor = new ValueStateDescriptor<>("uids-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {
                }));
                bloomFilterState = getRuntimeContext().getState(bloomFilterDescriptor);

                ValueStateDescriptor<Integer> uidCountStateDescriptor = new ValueStateDescriptor<>("uid-count-state", Integer.class);
                uidCountState = getRuntimeContext().getState(uidCountStateDescriptor);

                ValueStateDescriptor<Integer> countStateDescriptor = new ValueStateDescriptor<>("count-state", Integer.class);
                countState = getRuntimeContext().getState(countStateDescriptor);

            }

            @Override
            public void processElement(Tuple3<String, String, String> value, Context ctx, Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
                String uid = value.f0;

                BloomFilter<String> bloomFilter = bloomFilterState.value();
                Integer uidCount = uidCountState.value();
                if (bloomFilter == null) {
                    bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
                    uidCount = 0;
                }
                //If the user is definitely not in the Bloom filter yet, add it and count one more distinct user
                if (!bloomFilter.mightContain(uid)) {
                    bloomFilter.put(uid);
                    uidCount++;
                }

                //update the state
                bloomFilterState.update(bloomFilter);
                uidCountState.update(uidCount);

                Integer count = countState.value();
                if (count == null) {
                    count = 0;
                }
                count++;
                //update the state
                countState.update(count);

                //emit the result
                out.collect(Tuple4.of(value.f1, value.f2, uidCount, count));
            }
        }).print();

        env.execute();

    }
}

VII. Determining New vs. Returning Users with a Bloom Filter

If the data does not already carry an isNew field, route each user to a fixed partition (the same key group) and keep state there that records whether the user has been seen before, i.e. whether they are new or returning.

package cn._51doit.jobs;

import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.IsNewUserFunction;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.udf.LocationFunction;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.concurrent.TimeUnit;

/**
 * Count the number of new users, returning users, and active users for the current day, broken down by dimensions (region, OS, brand, etc.)
 */
public class UserCount3 {

    public static void main(String[] args) throws Exception {

        DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);

        //Parse the data
        SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());

        //beanStream.map()
        SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));

        //First determine whether the current device ID belongs to a new user
        //Key by device type (phone model)
        KeyedStream<DataBean, String> keyed = filtered.keyBy(DataBean::getDeviceType);

        keyed.process(new IsNewUserFunction()).print();

        FlinkUtils.env.execute();

    }
}

2. IsNewUserFunction

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

/**
 * Decides whether a user is new based on the device ID (UDID)
 *
 * The stream is keyed by device type first
 */
public class IsNewUserFunction extends KeyedProcessFunction<String, DataBean, DataBean> {

    private transient ValueState<BloomFilter<String>> bloomFilterState;


    @Override
    public void open(Configuration parameters) throws Exception {
        ValueStateDescriptor<BloomFilter<String>> valueStateDescriptor = new ValueStateDescriptor<BloomFilter<String>>("uid-bloom-filter-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {}));
        bloomFilterState = getRuntimeContext().getState(valueStateDescriptor);

    }

    @Override
    public void processElement(DataBean bean, Context ctx, Collector<DataBean> out) throws Exception {

        String deviceId = bean.getDeviceId();

        BloomFilter<String> bloomFilter = bloomFilterState.value();
        if (bloomFilter == null) {
            bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
        }
        //Check whether this device ID is already in the Bloom filter
       if(!bloomFilter.mightContain(deviceId)) {
           bloomFilter.put(deviceId);
           bean.setIsN(1); //是新用户
           bloomFilterState.update(bloomFilter);
       }
       out.collect(bean);
    }
}

VIII. Optimizing the New-User Computation

With the approach above, one popular phone model can receive far more data than the others and cause data skew. Instead, key the stream by device ID so the load spreads evenly, and keep a single Bloom filter per parallel subtask (as operator state) holding all the device IDs that subtask has seen.

package cn._51doit.jobs;

import cn._51doit.constant.EventID;
import cn._51doit.pojo.DataBean;
import cn._51doit.udf.IsNewUserFunc2;
import cn._51doit.udf.IsNewUserFunction;
import cn._51doit.udf.JsonToBeanFunc;
import cn._51doit.utils.FlinkUtils;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

/**
 * Optimization: key by deviceId and use operator state; even though one subtask (partition) holds many key groups, a single Bloom filter is used per subtask
 */
public class UserCount4 {

    public static void main(String[] args) throws Exception {

        DataStream<String> kafkaStream = FlinkUtils.createKafkaStream(args, SimpleStringSchema.class);

        //Parse the data
        SingleOutputStreamOperator<DataBean> beanStream = kafkaStream.process(new JsonToBeanFunc());

        //beanStream.map()
        SingleOutputStreamOperator<DataBean> filtered = beanStream.filter(bean -> EventID.APP_LAUNCH.equals(bean.getEventId()));

        //Key by device ID
        KeyedStream<DataBean, String> keyed = filtered.keyBy(DataBean::getDeviceId);

        SingleOutputStreamOperator<DataBean> res = keyed.map(new IsNewUserFunc2());
        
        res.print();

        FlinkUtils.env.execute();

    }
}

2. IsNewUserFunc2

package cn._51doit.udf;

import cn._51doit.pojo.DataBean;

import java.util.Collections;

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;

/**
 * Uses operator state: one Bloom filter per parallel subtask
 */
public class IsNewUserFunc2 extends RichMapFunction<DataBean, DataBean> implements CheckpointedFunction {

	
    private transient ListState<BloomFilter<String>> listState;
    private transient BloomFilter<String> bloomFilter;

    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
        //Called only once, when the function is initialized or restored
        ListStateDescriptor<BloomFilter<String>> stateDescriptor = new ListStateDescriptor<>(
                "uid-bloom-filter-state", TypeInformation.of(new TypeHint<BloomFilter<String>>() {}));
        listState = context.getOperatorStateStore().getListState(stateDescriptor);
        if (context.isRestored()) {
            //Restore the Bloom filter from the previous checkpoint
            for (BloomFilter<String> restored : listState.get()) {
                this.bloomFilter = restored;
            }
        }
    }

    @Override
    public DataBean map(DataBean bean) throws Exception {
        //Called once per record
        String deviceId = bean.getDeviceId();
        if (bloomFilter == null) {
            bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100000);
        }
        if (!bloomFilter.mightContain(deviceId)) {
            //Definitely not seen on this subtask before: mark as a new user
            bloomFilter.put(deviceId);
            bean.setIsN(1);
        }
        return bean;
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        //Called on every checkpoint: store the current Bloom filter into operator state
        listState.update(Collections.singletonList(bloomFilter));
    }
}

3. Using RocksDB as the StateBackend

Advantage: incremental checkpoints.

Option 1:

Add the dependency:

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
    <version>${flink.version}</version>
<!--    <scope>provided</scope>-->
</dependency>

Change the code:

 env.setStateBackend(new RocksDBStateBackend(checkpointPath, true));

Option 2:

Configure it globally in flink-conf.yaml:

# Note: there must be a space after the colon of each option, otherwise the config will not parse
state.backend: rocksdb
state.checkpoints.dir: hdfs://namenode-host:port/flink-checkpoints
state.backend.incremental: true 