Flink Project: Real-Time E-Commerce Data Analysis (Part 3)


This article continues from Part 2: https://blog.csdn.net/weixin_38255444/article/details/104820912

V: Flink real-time business development

  1. Importing the Maven dependencies

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
             http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <parent>
        <artifactId>FlinkCase</artifactId>
        <groupId>com.ityouxin</groupId>
        <version>1.0-SNAPSHOT</version>
      </parent>
      <modelVersion>4.0.0</modelVersion>
      <artifactId>realProcess</artifactId>

      <properties>
        <scala.binary.version>2.11</scala.binary.version>
        <flink.version>1.6.0</flink.version>
        <hadoop.version>2.6.0</hadoop.version>
        <hbase.version>1.2.0</hbase.version>
        <cdh.version>cdh5.14.0</cdh.version>
      </properties>

      <repositories>
        <repository>
          <id>cloudera</id>
          <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
      </repositories>

      <dependencies>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-connector-kafka-0.9_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-shaded-jackson -->
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-shaded-jackson</artifactId>
          <version>2.7.9-2.0</version>
        </dependency>
        <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.47</version>
        </dependency>
        <dependency>
          <groupId>redis.clients</groupId>
          <artifactId>jedis</artifactId>
          <version>2.9.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
          <groupId>org.apache.hbase</groupId>
          <artifactId>hbase-client</artifactId>
          <version>${hbase.version}-${cdh.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
          <groupId>org.apache.hbase</groupId>
          <artifactId>hbase-server</artifactId>
          <version>1.2.0-cdh5.14.0</version>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-table_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-scala_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>${hadoop.version}-${cdh.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>${hadoop.version}-${cdh.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>${hadoop.version}</version>
          <exclusions>
            <exclusion>
              <groupId>com.google.protobuf</groupId>
              <artifactId>protobuf-java</artifactId>
            </exclusion>
          </exclusions>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-hbase_2.11</artifactId>
          <version>${flink.version}</version>
        </dependency>
      </dependencies>

      <profiles>
        <!-- development environment -->
        <profile>
          <id>dev</id>
          <activation>
            <activeByDefault>true</activeByDefault>
            <property>
              <name>env</name>
              <value>Dev</value>
            </property>
          </activation>
          <build>
            <resources>
              <resource>
                <directory>src/main/resources/dev</directory>
              </resource>
            </resources>
          </build>
        </profile>
        <!-- production environment -->
        <profile>
          <id>pro</id>
          <activation>
            <property>
              <name>env</name>
              <value>pro</value>
            </property>
          </activation>
          <build>
            <resources>
              <resource>
                <directory>src/main/resources/pro</directory>
              </resource>
            </resources>
          </build>
        </profile>
        <!-- test environment -->
        <profile>
          <id>test</id>
          <activation>
            <property>
              <name>env</name>
              <value>test</value>
            </property>
          </activation>
          <build>
            <resources>
              <resource>
                <directory>src/main/resources/test</directory>
              </resource>
            </resources>
          </build>
        </profile>
      </profiles>
    </project>
    
    2. Add the development, production and test environments

    resources directories:

    Development: dev

    Production: pro

    Test: test

    The profile to use is chosen either by its id (for example mvn package -Ppro) or through the env property declared in each profile's activation block.

    3. Add the configuration file application.conf
    # Kafka configuration
    bootstrap.servers="hadoop01:9092,hadoop02:9092,hadoop03:9092"
    zookeeper.connect="hadoop01:2181,hadoop02:2181,hadoop03:2181"
    input.topic="test"
    group.id="test"
    # HBase configuration
    hbase.master="hadoop01:60000"
    hbase.zookeeper.quorum="hadoop01:2181,hadoop02:2181,hadoop03:2181"
    hbase.zookeeper.property.clientPort="2181"
    hbase.rpc.timeout="60000"
    hbase.client.operation.timeout="20000"
    hbase.client.scanner.timeout.period="200000"
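    GlobalConfigUtils (next step) also reads a few keys that are not shown above: the Kafka consumer commit settings, the maximum watermark delay and the window length. A minimal sketch of the missing entries, with illustrative values that are assumptions rather than the project's real settings:

    # Kafka consumer commit behaviour (example values)
    enable.auto.commit="true"
    auto.commit.interval.ms="5000"
    auto.offset.reset="latest"
    # watermark delay in milliseconds and window length in seconds (example values)
    maxDelayTime="2000"
    timeWindow="10"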
    
    4. A utility class for reading the configuration
    package com.ityouxin.tools
    import com.typesafe.config.ConfigFactory
    object GlobalConfigUtils {
      //load application.conf from the classpath with ConfigFactory
      private val conf = ConfigFactory.load()
      //expose the individual configuration entries
      def bootstrapServers:String = conf.getString("bootstrap.servers")
      def zookeeperConnect:String = conf.getString("zookeeper.connect")
      def inputTopic:String = conf.getString("input.topic")
      def groupId:String = conf.getString("group.id")
      def autoCommit:String = conf.getString("enable.auto.commit")
      def autoCommitInterval:String = conf.getString("auto.commit.interval.ms")
      def autoOffsetReset:String = conf.getString("auto.offset.reset")
      def zookeeperQuorum:String = conf.getString("hbase.zookeeper.quorum")
      def hbaseMaster:String = conf.getString("hbase.master")
      def clientPort:String = conf.getString("hbase.zookeeper.property.clientPort")
      def rpcTimeout:String = conf.getString("hbase.rpc.timeout")
      def operatorTimeout:String = conf.getString("hbase.client.operation.timeout")
      def timeoutPeriod:String = conf.getString("hbase.client.scanner.timeout.period")
      def maxDelayTime:Long = conf.getString("maxDelayTime").toLong
      def timeWindow:Long = conf.getString("timeWindow").toLong
    }
    
    
    5. Problems and analysis

    1. Before any code is written, two points must be considered:

    1: How do we keep data safe while it is being processed, i.e. how do we prevent data loss?
    2: How do we handle network latency? (With delays, records belonging to batch 1 may only arrive while batch N is being processed, so the result can no longer be kept consistent in real time.)

    2. Solving the data-loss problem

    In Flink, this problem is easy to address.
    The checkpoint mechanism is the cornerstone of Flink's reliability: when an operator fails for some reason (for example, it exits abnormally), the state of the whole streaming job graph can be restored to a consistent earlier state. Flink's checkpointing is based on the Chandy-Lamport algorithm.
    For every application that needs checkpoints, the JobManager creates a CheckpointCoordinator at startup; the CheckpointCoordinator is fully responsible for taking that application's snapshots.

    1. The CheckpointCoordinator periodically sends a barrier to every source operator of the streaming application.
    2. When a source operator receives a barrier, it pauses processing, snapshots its current state to the configured persistent storage, reports the snapshot to the CheckpointCoordinator, broadcasts the barrier to all of its downstream operators, and then resumes processing.
    3. When a downstream operator receives the barrier, it does the same: pause processing, snapshot its state to persistent storage, report to the CheckpointCoordinator, broadcast the barrier to its own downstream operators, and resume processing.
    4. Every operator repeats step 3 until the barrier finally reaches the sink operators and the snapshot is complete.
    5. When the CheckpointCoordinator has received reports from all operators, the checkpoint for that period is considered successful; if the reports do not all arrive within the configured time, the checkpoint fails.
    6. Three kinds of persistent storage are currently available for checkpoints (see the sketch after this list):
    MemoryStateBackend
    FsStateBackend
    RocksDBStateBackend
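    A minimal sketch of how checkpointing could be switched on for this job, applied to the StreamExecutionEnvironment created in the App class below; the interval, timeout and HDFS path are placeholder assumptions, not values taken from the project:

    package com.ityouxin.process

    import org.apache.flink.runtime.state.filesystem.FsStateBackend
    import org.apache.flink.streaming.api.CheckpointingMode
    import org.apache.flink.streaming.api.environment.CheckpointConfig
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

    object CheckpointConfigSketch {
      //apply checkpoint settings to the senv built in App.main
      def configure(senv: StreamExecutionEnvironment): Unit = {
        senv.enableCheckpointing(5000)                        //take a snapshot every 5 seconds
        senv.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
        senv.getCheckpointConfig.setCheckpointTimeout(60000)  //give up on a snapshot after 60 seconds
        //keep the last completed checkpoint even when the job is cancelled
        senv.getCheckpointConfig.enableExternalizedCheckpoints(
          CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
        //FsStateBackend: working state on the TaskManager heap, snapshots persisted to HDFS
        senv.setStateBackend(new FsStateBackend("hdfs://hadoop01:8020/flink/checkpoints"))
      }
    }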
    
  2. Solving the network-latency problem

    An unbounded stream cannot be aggregated directly: because the stream has no boundary, there is no way to count how many records have passed through it, or to compute a maximum, minimum, average or running sum. If a slice of fixed size is cut out of the stream, that slice can be aggregated. There are two main ways to cut such a slice: by record count or by time.

    1. Real-world clocks are not consistent across machines, so Flink distinguishes three notions of time: event time, ingestion time and processing time.
    2. A window defined on EventTime is an EventTimeWindow; it requires every message to carry its own event timestamp.
    3. A window defined on IngestionTime is an IngestionTimeWindow; it uses the system time of the source.
    4. A window defined on ProcessingTime is a ProcessingTimeWindow; it uses the system time of the operator.
    Flink was designed with network latency and out-of-order data in mind, which is why it introduces the watermark (WaterMark) abstraction. The watermark trails the largest event timestamp seen so far by maxDelayTime; for example, with a maxDelayTime of 2 seconds, a 10-second event-time window ending at 10:00:10 only fires once an element with a timestamp of 10:00:12 or later has arrived.
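    For reference, a minimal sketch of the same watermark policy using Flink's built-in BoundedOutOfOrdernessTimestampExtractor; the 2-second bound here is an assumed value, whereas the project reads maxDelayTime from its configuration:

    package com.ityouxin.process

    import com.ityouxin.process.bean.Message
    import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
    import org.apache.flink.streaming.api.windowing.time.Time

    //equivalent to the hand-written AssignerWithPeriodicWatermarks in App below:
    //the watermark trails the largest seen timestamp by the allowed out-of-orderness
    class MessageTimestampExtractor
      extends BoundedOutOfOrdernessTimestampExtractor[Message](Time.seconds(2)) {
      override def extractTimestamp(element: Message): Long = element.timestamp
    }

    It would be used as messageDS.assignTimestampsAndWatermarks(new MessageTimestampExtractor) in place of the anonymous assigner shown below.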
    
  3. The App driver class

    package com.ityouxin.process
    
    import java.util.Properties
    
    import org.apache.flink.streaming.api.watermark.Watermark
    import com.alibaba.fastjson.{JSON, JSONObject}
    import com.ityouxin.process.bean.{Message, UserBrowse}
    import com.ityouxin.process.task.ChannelRealHotTask
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.api.common.serialization.SimpleStringSchema
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
    import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09
    import org.apache.kafka.clients.consumer.ConsumerConfig
    import org.apache.flink.streaming.api.scala._
    object App {
    
    
    
      def main(args: Array[String]): Unit = {
        //初始化senv
        val senv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //use event time so that windows and watermarks can account for late data
        senv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        senv.setParallelism(1)
    //pull data from Kafka with Flink's Kafka consumer
    //build the Kafka consumer configuration
        val props: Properties = new Properties()
        props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,GlobalConfigUtils.bootstrapServers)
        props.setProperty(ConsumerConfig.GROUP_ID_CONFIG,GlobalConfigUtils.groupId)
    props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringDeserializer")
        props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,GlobalConfigUtils.autoCommit)
        props.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG,GlobalConfigUtils.autoCommitInterval)
        props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,GlobalConfigUtils.autoOffsetReset)
        val kafkaSource = new FlinkKafkaConsumer09[String](GlobalConfigUtils.inputTopic,new SimpleStringSchema(),props)
        //读取kafka的数据,加载成DS
        val logDataStream: DataStream[String] = senv.addSource(kafkaSource)
        //将消息体转换为一个对象
        val messageDS: DataStream[Message] = logDataStream.map(
          log => {
            println(log)
            val jSONObject: JSONObject = JSON.parseObject(log)
            val count = jSONObject.get("count").toString.toInt
            val message: String = jSONObject.get("message").toString
            val timestamp: Long = jSONObject.get("timestamp").toString.toLong
            Message(UserBrowse.toBean(message), count, timestamp)
          }
        )
    
        //加入水印,延迟加载
        val watermarkDS: DataStream[Message] = messageDS.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[Message]{
          var currentMaxTimestamp = 0L
          var maxDelayTime = GlobalConfigUtils.maxDelayTime
          override def getCurrentWatermark:Watermark = {
            new Watermark(currentMaxTimestamp - maxDelayTime)
          }
    
          override def extractTimestamp(element: Message, previousElementTimestamp: Long) = {
        //the element's own event timestamp
        val timestamp: Long = element.timestamp
        //track the largest timestamp seen so far
        currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
            timestamp
          }
        })
    //business processing
    //1. real-time channel hot spots
    ChannelRealHotTask.process(watermarkDS)
    //submit the job; without execute() the pipeline above never runs (the name is just a label)
    senv.execute("RealProcessApp")
      }
    }
    
    
  4. Business 1: real-time channel hot spots

    A real-time channel hot spot is simply the number of clicks each channel receives within the configured time window.

    1. task
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelRealHot, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.RealHostMap
    import com.ityouxin.process.reduce.ChannelRealHotReduce
    import com.ityouxin.process.sink.ChannelRealHotSink
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    //Business 1: real-time channel hot spots
    //tasks are invoked through the DataProcess trait
    object ChannelRealHotTask extends DataProcess{
      //real-time channel hot-spot analysis
      override def process(ds: DataStream[Message]): Unit = {
        //map each message into a ChannelRealHot record
        val flatMapDS: DataStream[ChannelRealHot] = ds.flatMap(new RealHostMap)
        //key the stream by channel ID
        val keyByedDS: KeyedStream[ChannelRealHot, Long] = flatMapDS.keyBy(item => item.channelID)
        //tumbling time window
        val windowedDS: WindowedStream[ChannelRealHot, Long, TimeWindow] = keyByedDS.timeWindow(Time.seconds(GlobalConfigUtils.timeWindow))
        //count the clicks per channel within the window
        val reduceDS: DataStream[ChannelRealHot] = windowedDS.reduce(new ChannelRealHotReduce)
        //write the result to HBase
        reduceDS.addSink(new ChannelRealHotSink)
      }
    }
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelRealHot(
                               channelID:Long,count:Long)
    
    package com.ityouxin.process.bean
    
    case class Message(
                        userBrowse: UserBrowse,count: Int,timestamp:Long)
    
    
    3. interface
    package com.ityouxin.process.inerface
    
    import com.ityouxin.process.bean.Message
    import org.apache.flink.streaming.api.scala.DataStream
    
    trait DataProcess {
      def process(ds:DataStream[Message])
    
    }
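    Every business task below implements this trait, so wiring another metric into the job is just one more call in App.main. A sketch, assuming all six tasks from this article are registered on the watermarked stream:

    //in App.main, after watermarkDS has been built:
    ChannelRealHotTask.process(watermarkDS)
    ChannelPVUVTask.process(watermarkDS)
    ChannelUserFreshnessTask.process(watermarkDS)
    ChannelRegionTask.process(watermarkDS)
    ChannelNetWorkTask.process(watermarkDS)
    ChannelBrowserTask.process(watermarkDS)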
    
    4. flatMap collector
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelRealHot, Message}
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    class RealHostMap extends FlatMapFunction[Message,ChannelRealHot]{
      override def flatMap(value: Message, out: Collector[ChannelRealHot]): Unit = {
        //得到用户浏览样例类的
        val channelId:Long = value.userBrowse.channelId
        //收集频道ID
        out.collect(ChannelRealHot(channelId,1))
      }
    }
    
    5. reduce aggregation
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.ChannelRealHot
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelRealHotReduce extends ReduceFunction[ChannelRealHot]{
      //实现频道的聚合,对频道进行统计
      override def reduce(t: ChannelRealHot, t1: ChannelRealHot): ChannelRealHot = {
        ChannelRealHot(t.channelID,t.count+t1.count)
      }
    }
    
    6. sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelRealHot
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    //
    class ChannelRealHotSink extends SinkFunction[ChannelRealHot]{
      override def invoke(channelRealHot: ChannelRealHot,connect:SinkFunction.Context[_]): Unit = {
        //输出
        //得到频道id和统计频道id的个数
        val channelID:Long = channelRealHot.channelID
        var count: Long = channelRealHot.count
        val tabChannel: TableName = TableName.valueOf("channel" )
        val rowKey = channelID
        val columnFamily = "info"
        val column = "count"
        //获取数据
        val value: String = HbaseUtils.getTablec(tabChannel,rowKey.toString,columnFamily,column)
    //if a historical count exists, add it to the current count
        if(StringUtils.isNotEmpty(value)){
          count += value.toLong
        }
        //容器   存放表的数据信息
        val fieldsData = new mutable.HashMap[String,String]()
        fieldsData.put(column,count.toString)
        //写入Hbase
        HbaseUtils.putData(tabChannel,rowKey.toString,columnFamily,fieldsData)
    
      }
    }
    
    
  5. Business 2: channel PV and UV
    PV (page views): every page load or refresh counts as one view.
    UV (unique visitors): within one day (00:00-24:00) the same client is counted only once.

    1. task
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelPVUV, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.ChannelPVUVMap
    import com.ityouxin.process.reduce.ChannelPVUVReduce
    import com.ityouxin.process.sink.ChannelPVUVSink
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    //Business 2: channel PV and UV
    object ChannelPVUVTask extends DataProcess{
      override def process(ds: DataStream[Message]): Unit = {
        //map the message stream into the business bean
        val flatedMapDS: DataStream[ChannelPVUV] = ds.flatMap(new ChannelPVUVMap)
        //key by channel plus time dimension (channel ID and hour / day / month form a composite key)
        val keyedS: KeyedStream[ChannelPVUV, String] = flatedMapDS.keyBy(item => item.channelID + item.dateField)
        //tumbling time window
        val window: WindowedStream[ChannelPVUV, String, TimeWindow] = keyedS.timeWindow(Time.seconds(GlobalConfigUtils.timeWindow))
        //aggregate per channel and time dimension
        val result: DataStream[ChannelPVUV] = window.reduce(new ChannelPVUVReduce)
        //write to HBase
        result.addSink(new ChannelPVUVSink)
      }
    }
    
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelPVUV (  channelID:Long,
                              userID:Long,
                              timestamp:Long,
                              pv:Long,
                              uv:Long,
                              dateField:String)
    
    
    3. map
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelPVUV, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    //maps each Message into one ChannelPVUV record per time dimension (hour / day / month),
    //mirroring the other dimension map classes in this project
    class ChannelPVUVMap extends FlatMapFunction[Message,ChannelPVUV]{
      override def flatMap(value: Message, out: Collector[ChannelPVUV]): Unit = {
        //time dimensions
        val timestamp = value.userBrowse.entryTime
        val hour = TimeUtils.getDate(timestamp,"yyyyMMddHH")
        val day = TimeUtils.getDate(timestamp,"yyyyMMdd")
        val month = TimeUtils.getDate(timestamp,"yyyyMM")
        val userId = value.userBrowse.userId
        val channelId = value.userBrowse.channelId
        //user state: has this user been seen in this hour / day / month before?
        val state: UserState = UserState.getUserState(userId.toString, timestamp)
        val isNew = state.isNew
        val isFirstHour = state.isFirstHour
        val isFirstDay = state.isFirstDay
        val isFirstMonth = state.isFirstMonth
        //every click counts as one PV; UV counts a client only on its first visit in the period
        val pv = 1L
        val hourUV = if (isNew || isFirstHour) 1L else 0L
        val dayUV = if (isNew || isFirstDay) 1L else 0L
        val monthUV = if (isNew || isFirstMonth) 1L else 0L
        //emit one record per time dimension
        out.collect(ChannelPVUV(channelId, userId, timestamp, pv, hourUV, hour))
        out.collect(ChannelPVUV(channelId, userId, timestamp, pv, dayUV, day))
        out.collect(ChannelPVUV(channelId, userId, timestamp, pv, monthUV, month))
      }
    }
    
    4. reduce
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.ChannelPVUV
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelPVUVReduce extends ReduceFunction[ChannelPVUV]{
      override def reduce(value1: ChannelPVUV, value2: ChannelPVUV): ChannelPVUV = {
        //
        ChannelPVUV(
          value1.channelID, // 频道
          value1.userID,
          value1.timestamp,
          value1.pv+value2.pv,// PV 聚合相加
          value1.uv+value2.uv,// UV 聚合相加
          value1.dateField) // 小时| 天| 月 维度
    
      }
    }
    
    
    5. sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelPVUV
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    
    class ChannelPVUVSink extends SinkFunction[ChannelPVUV]{
      override def invoke(value: ChannelPVUV, context: SinkFunction.Context[_]): Unit = {
        //table name
        val tableName: TableName = TableName.valueOf("channelPVUV")
        //column family
        val columnFamily = "info"
        //row key: left-padded channel ID plus the time dimension
        val rowKey = HbaseUtils.lpad(value.channelID.toString,3) + ":" + value.dateField
        //read the historical PV / UV values
        val pvData: String = HbaseUtils.getTablec(tableName, rowKey, columnFamily, "PV")
        val uvData: String = HbaseUtils.getTablec(tableName, rowKey, columnFamily, "UV")
        var pv = value.pv
        var uv = value.uv
        //accumulate onto the historical values when they exist
        if (StringUtils.isNotBlank(pvData)) {
          pv += pvData.toLong
        }
        if (StringUtils.isNotBlank(uvData)) {
          uv += uvData.toLong
        }
        //write the updated values back to HBase
        val fieldsData = new mutable.HashMap[String, String]()
        fieldsData.put("PV", pv.toString)
        fieldsData.put("UV", uv.toString)
        HbaseUtils.putData(tableName, rowKey, columnFamily, fieldsData)
      }
    }
    
    
  6. Business 3: channel user freshness

    Process the data and produce the user-freshness entity (new vs. returning users per channel and time dimension).

    1. task
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelUserFreshness, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.ChannelUserFreshnessMap
    import com.ityouxin.process.reduce.ChannelUserFreshnessReduce
    import com.ityouxin.process.sink.ChannelUserFreshnessSink
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    
    //Business 3: map the data into the user-freshness entity
    object ChannelUserFreshnessTask extends DataProcess{
      override def process(ds: DataStream[Message]): Unit = {
        //convert Message into the business bean
        val flatMapedDS: DataStream[ChannelUserFreshness] = ds.flatMap(new ChannelUserFreshnessMap)
        //key by channel plus time dimension (hour / day / month)
        val keyedS: KeyedStream[ChannelUserFreshness, String] = flatMapedDS.keyBy(item => item.channelID + item.dateField)
        //tumbling time window
        val window: WindowedStream[ChannelUserFreshness, String, TimeWindow] = keyedS.timeWindow(Time.seconds(GlobalConfigUtils.timeWindow))
        //aggregate within the window
        val result: DataStream[ChannelUserFreshness] = window.reduce(new ChannelUserFreshnessReduce)
        //write to the sink
        result.addSink(new ChannelUserFreshnessSink)
      }
    }
    
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelUserFreshness (
      channelID:String,
      timestamp:Long,
      userId:Long,
      newCount:Long=0,
      oldCount:Long=0,
      dateField:String
                                    )
    
    3.map
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelUserFreshness, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    class ChannelUserFreshnessMap extends FlatMapFunction[Message,ChannelUserFreshness]{
      override def flatMap(value: Message, out: Collector[ChannelUserFreshness]): Unit = {
        //获取字段信息
        val timestamp: Long = value.userBrowse.entryTime
        val hour= TimeUtils.getDate(timestamp,"yyyyMMddHH")
        val day = TimeUtils.getDate(timestamp,"yyyyMMdd")
        val month = TimeUtils.getDate(timestamp,"yyyyMM")
        val userId = value.userBrowse.userId
        val channelID = value.userBrowse.channelId
        //分析用户的状态
        val state: UserState = UserState.getUserState(userId.toString, timestamp)
        val isNew = state.isNew
        val isFirstHour = state.isFirstHour
        val isFirstDay = state.isFirstDay
        val isFirstMonth = state.isFirstMonth
    
        //映射为新用户
        var newCount = 0L
        //小时的oldCount
        var hourOldCount =0L
        //天的oldCount
        var dayOldCount = 0L
        //月的
        var monthOldCount = 0L
        //判断是否是新用户
        if (isNew){
          newCount=1L
        }else{
          if (isFirstHour){
            hourOldCount=1L
          }
          if (isFirstDay){
            dayOldCount=1L
          }
          if (isFirstMonth){
            monthOldCount=1L
          }
        }
        //注意返回三个维度的ChannelUserFreshness
        out.collect(ChannelUserFreshness(channelID.toString ,timestamp ,userId,newCount,hourOldCount,hour))
        out.collect(ChannelUserFreshness(channelID.toString ,timestamp,userId,newCount,dayOldCount,day))
        out.collect(ChannelUserFreshness(channelID.toString ,timestamp,userId,newCount,monthOldCount,month))
    
      }
    }
    
    4. reduce
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.{ChannelUserFreshness, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelUserFreshnessReduce extends ReduceFunction[ChannelUserFreshness]{
      override def reduce(value1: ChannelUserFreshness, value2: ChannelUserFreshness): ChannelUserFreshness = {
        ChannelUserFreshness(
          value1.channelID,
          value1.timestamp,
          value1.userId,
          value1.newCount+value2.newCount,
          value1.oldCount+value2.oldCount,
          value1.dateField
        )
      }
    }
    
    5. sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelUserFreshness
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    
    class ChannelUserFreshnessSink extends SinkFunction[ChannelUserFreshness]{
      override def invoke(value: ChannelUserFreshness, context: SinkFunction.Context[_]): Unit = {
        //获取表信息
        val tableName = TableName.valueOf("channelPVUV")
        val columnFamily="info"
        val newCount:String="newCount"
        val oldCount:String="oldCount"
        val rowKey= HbaseUtils.lpad(value.channelID,3) +":"+value.dateField
        //查询历史数据
        val newCountData: String = HbaseUtils.getTablec(tableName,rowKey,columnFamily,newCount)
        val oldCountData:String = HbaseUtils.getTablec(tableName,rowKey,columnFamily,oldCount)
        //取出新的数据
        var newCountV = value.newCount
        var oldCountV = value.oldCount
        //判断新数据是否为空
        if(StringUtils.isNotBlank(newCountData)){
          //+
          newCountV +=newCountData.toLong
        }
        if(StringUtils.isNotBlank(oldCountData)){
          oldCountV += oldCountData.toLong
        }
        //写入Hbase
        val map = new mutable.HashMap[String,String]()
        map.put(newCount,newCountV.toString)
        map.put(oldCount,oldCountV.toString)
        HbaseUtils.putData(tableName,rowKey,columnFamily,map)
      }
    }
    
  7. Business 4: channel region metrics

    1. task
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelRegion, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.ChannelRegionMap
    import com.ityouxin.process.reduce.ChannelRegionReduce
    import com.ityouxin.process.sink.ChannelRegionSink
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    //Business 4: channel region metrics
    object ChannelRegionTask extends DataProcess{
      override def process(ds: DataStream[Message]): Unit = {
        //转化Message
        val flatedDS: DataStream[ChannelRegion] = ds.flatMap(new ChannelRegionMap)
        //composite key: channelID + dateField + area (country, province, city)
        val keyedStream: KeyedStream[ChannelRegion, String] = flatedDS.keyBy(
          item =>
            item.channelID + item.dateField + item.country + item.province + item.city
        )
        //划分时间窗口
        val window: WindowedStream[ChannelRegion, String, TimeWindow] = keyedStream.timeWindow(Time.seconds(GlobalConfigUtils.timeWindow))
        //窗口聚合
        val reduceDS: DataStream[ChannelRegion] = window.reduce(new ChannelRegionReduce)
        //写入hbase
        reduceDS.addSink(new ChannelRegionSink)
      }
    }
    
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelRegion (
                               channelID:Long,
                               country:String,
                               province:String,
                               city:String,
                               pv:Long=0L,
                               uv:Long=0,
                               newCount:Long=0,
                               oldCount:Long=0,
                               timestamp:Long,
                               userId:Long,
                               dateField:String
                             ) {
    
      def getArea(): String = {
        country + "_" + province + "_" + city + "_"
      }
    }
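    getArea keeps a trailing underscore on purpose: ChannelRegionSink appends the metric name to it, so the column qualifier for, say, page views in china/beijing/beijing becomes china_beijing_beijing_PV. A quick illustration with made-up values:

    import com.ityouxin.process.bean.ChannelRegion

    object AreaColumnExample {
      def main(args: Array[String]): Unit = {
        //hypothetical record; the area prefix plus the metric name forms the HBase column qualifier
        val region = ChannelRegion(7L, "china", "beijing", "beijing", 1L, 1L, 0L, 1L, 1583932800000L, 1001L, "2020031210")
        println(region.getArea() + "PV") //prints china_beijing_beijing_PV
      }
    }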
    
    3. map
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelRegion, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    class ChannelRegionMap extends FlatMapFunction[Message,ChannelRegion]{
      override def flatMap(value: Message, out: Collector[ChannelRegion]): Unit = {
        //extract the fields from the browse record
        val timestamp = value.userBrowse.entryTime
        val hour = TimeUtils.getDate(timestamp,"yyyyMMddHH")
        val day = TimeUtils.getDate(timestamp,"yyyyMMdd")
        val month = TimeUtils.getDate(timestamp,"yyyyMM")
        val userId = value.userBrowse.userId
        val channelId =value.userBrowse.channelId
        //
        val country = value.userBrowse.country
        val province =value.userBrowse.province
        val city = value.userBrowse.city
        //取出用户的状态信息
        val state: UserState = UserState.getUserState(userId.toString,timestamp)
        val isNew: Boolean = state.isNew
        val isFirstHour: Boolean = state.isFirstHour
        val isFirstDay: Boolean = state.isFirstDay
        val isFirstMonth: Boolean = state.isFirstMonth
        //设置pv和不同维度的UV
        var pv:Long=1L
        var hourUV:Long=0L
        var dayUV:Long=0L
        var monthUV:Long=0L
    
        //设置 和判断
        //映射成 新老用户
        var newCount =0L
        //小时的 oldCount
        var hourOldCount=0L
        //天的 oldCount
        var dayOldCount=0L
        //月的 oldCount
        var monthOldCount=0L
        if(isNew){
          newCount=1L
        }else{
          if(isFirstHour){
            hourOldCount=1L
            hourUV=1L
          }
          if(isFirstDay){
            dayOldCount=1L
            dayUV=1L
          }
          if(isFirstMonth){
            monthOldCount=1L
            monthUV=1L
          }
        }
        //返回三个时间维度的数据
        out.collect(ChannelRegion(channelId,country,province,city,pv,hourUV,newCount,hourOldCount,timestamp,userId,hour))
        out.collect(ChannelRegion(channelId,country,province,city,pv,dayUV,newCount,dayOldCount,timestamp,userId,day))
        out.collect(ChannelRegion(channelId,country,province,city,pv,monthUV,newCount,monthOldCount,timestamp,userId,month))
      }
    }
    
    4. reduce
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.ChannelRegion
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelRegionReduce extends ReduceFunction[ChannelRegion]{
      override def reduce(value1: ChannelRegion, value2: ChannelRegion): ChannelRegion = {
        ChannelRegion(
          value1.channelID,
          value1.country,
          value1.province,
          value1.city,
          value1.pv+value2.pv,
          value1.uv+value2.uv,
          value1.newCount+value2.newCount,
          value1.oldCount+value2.oldCount,
          value1.timestamp,
          value1.userId,
          value1.dateField
        )
      }
    }
    
    5.sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelRegion
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    
    class ChannelRegionSink extends SinkFunction[ChannelRegion]{
      override def invoke(value: ChannelRegion, context: SinkFunction.Context[_]): Unit = {
        //Table
        val tableName = TableName.valueOf("channelPVUV")
        val columnFamily="info"
        val regionPV= value.getArea()+"PV"
        val regionUV = value.getArea()+ "UV"
        val regionNewCount =value.getArea()+ "newCount"
        val regionOldCount = value.getArea()+ "oldCount"
        val rowKey = HbaseUtils.lpad(value.channelID.toString,3)+":"+value.dateField
    
        var regionPVV = value.pv
        var regionUVV = value.uv
        var regionNewCountV= value.newCount
        var regionOldCountV = value.oldCount
    
        //取历史数据
        val pvData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,regionPV)
        val uvData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,regionUV)
        val newCountData =HbaseUtils.getTablec(tableName,rowKey,columnFamily,regionNewCount)
        val oldCountData=HbaseUtils.getTablec(tableName,rowKey,columnFamily,regionOldCount)
    
        if(StringUtils.isNotBlank(pvData)){
          regionPVV += pvData.toLong
        }
    
        if(StringUtils.isNotBlank(uvData)){
          regionUVV+=uvData.toLong
        }
        if(StringUtils.isNotBlank(newCountData)){
          regionNewCountV+=newCountData.toLong
        }
        if(StringUtils.isNotBlank(oldCountData)){
          regionOldCountV+=oldCountData.toLong
        }
    
        //需要写入的字段
        val map=new mutable.HashMap[String,String]()
        map.put(regionPV,regionPVV.toString)
        map.put(regionUV,regionUVV.toString)
        map.put(regionNewCount,regionNewCountV.toString)
        map.put(regionOldCount,regionOldCountV.toString)
    
        HbaseUtils.putData(tableName,rowKey,columnFamily,map)
      }
    }
    
  8. Business 5: network (carrier) metrics

    1.task
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelNetWork, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.ChannelNetWorkMap
    import com.ityouxin.process.reduce.ChannelNetWorkReduce
    import com.ityouxin.process.sink.ChannelNetWorkSink
    import com.ityouxin.tools.GlobalConfigUtils
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    
    //Business 5: network (carrier) metrics
    object ChannelNetWorkTask extends DataProcess{
      override def process(ds: DataStream[Message]): Unit = {
        //转化
        val flatedDS: DataStream[ChannelNetWork] = ds.flatMap(new ChannelNetWorkMap)
        //分流
        val keyedS: KeyedStream[ChannelNetWork, String] = flatedDS.keyBy(item => item.dateField + item.network)
        //窗口
        val window: WindowedStream[ChannelNetWork, String, TimeWindow] = keyedS.timeWindow(Time.seconds(GlobalConfigUtils.timeWindow))
        //结果
        val reduceedDS: DataStream[ChannelNetWork] = window.reduce(new ChannelNetWorkReduce)
        //下沉
        reduceedDS.addSink(new ChannelNetWorkSink)
      }
    }
    
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelNetWork(
                               network:String,
                               count:Long,
                               newCount:Long,
                               oldCount:Long,
                               timestamp:Long,
                               dateField:String
                        )
    3. map
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelNetWork, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    class ChannelNetWorkMap extends FlatMapFunction[Message,ChannelNetWork]{
      override def flatMap(value: Message, out: Collector[ChannelNetWork]): Unit = {
        //获取时间维度和用户state
        val timestamp = value.userBrowse.entryTime
        val hour = TimeUtils.getDate(timestamp,"yyyyMMddHH")
        val day = TimeUtils.getDate(timestamp,"yyyyMMdd")
        val month = TimeUtils.getDate(timestamp,"yyyyMM")
    
        val userID = value.userBrowse.userId
        val netWork= value.userBrowse.network
    
        val state: UserState = UserState.getUserState(userID.toString, timestamp)
        val isNew = state.isNew
        val isFirstHour = state.isFirstHour
        val isFirstDay = state.isFirstDay
        val isFirstMonth =state.isFirstMonth
        val count:Long = 1L
        var newCount =0L
        var hourOldCount =0L
        var dayOldCount =0L
        var monthOldCount =0L
    
        if(isNew){
          newCount =1L
        }else{
          if(isFirstHour){
            hourOldCount=1L
          }
          if(isFirstDay){
            dayOldCount=1L
          }
          if(isFirstMonth){
            monthOldCount=1L
          }
        }
    
        //映射成三个维度的业务数据
        out.collect( ChannelNetWork(netWork,count,newCount,hourOldCount,timestamp,hour))
        out.collect( ChannelNetWork(netWork,count,newCount,dayOldCount,timestamp,day))
        out.collect( ChannelNetWork(netWork,count,newCount,monthOldCount,timestamp,month))
      }
    }
    
    4. reduce
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.ChannelNetWork
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelNetWorkReduce extends ReduceFunction[ChannelNetWork]{
      override def reduce(value1: ChannelNetWork, value2: ChannelNetWork): ChannelNetWork = {
        ChannelNetWork(
          value1.network,
          value1.count+value2.count,
          value1.newCount+value2.newCount,
          value1.oldCount+value2.oldCount,
          value1.timestamp ,
          value1.dateField
        )
      }
    }
    
    5. sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelNetWork
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    
    class ChannelNetWorkSink extends SinkFunction[ChannelNetWork]{
      override def invoke(value: ChannelNetWork, context: SinkFunction.Context[_]): Unit = {
        val network = value.network
        val timestamp = value.timestamp
    
        val tableName = TableName.valueOf("device")
        val columnFamily="info"
        val nwCount = network+"_Count"
        val nwNewCount = network+"_NewCount"
        val nwOldCount = network+"_OldCount"
        val rowKey = value.dateField
    
        //获取历史数据
        val countData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwCount)
        val newCountData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwNewCount)
        val oldCountData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwOldCount)
    
        var count = value.count
        var newCount = value.newCount
        var oldCount = value.oldCount
        if(StringUtils.isNotBlank(countData)){
          count+=countData.toLong
        }
        if(StringUtils.isNotBlank(newCountData)){
          newCount+=newCountData.toLong
        }
        if(StringUtils.isNotBlank(oldCountData)){
          oldCount+=oldCountData.toLong
        }
        val map = new mutable.HashMap[String,String]()
        map.put(nwCount,count.toString)
        map.put(nwNewCount,newCount.toString)
        map.put(nwOldCount,oldCount.toString)
        HbaseUtils.putData(tableName,rowKey,columnFamily,map)
      }
    }
    
    
  9. Business 6: browser-type metrics

    1. task
    
    package com.ityouxin.process.task
    
    import com.ityouxin.process.bean.{ChannelBrowser, Message}
    import com.ityouxin.process.inerface.DataProcess
    import com.ityouxin.process.map.ChannelBrowserMap
    import com.ityouxin.process.reduce.ChannelBrowserReduce
    import com.ityouxin.process.sink.ChannelBrowserSink
    import org.apache.flink.streaming.api.scala.DataStream
    import org.apache.flink.streaming.api.scala._
    //Business 6: browser-type metrics
    object ChannelBrowserTask extends DataProcess{
      override def process(ds: DataStream[Message]): Unit = {
        val flatedDS: DataStream[ChannelBrowser] = ds.flatMap(new ChannelBrowserMap)
        val keyedS: KeyedStream[ChannelBrowser, String] = flatedDS.keyBy(item => item.browser)
        val resoult: DataStream[ChannelBrowser] = keyedS.reduce(new ChannelBrowserReduce)
        resoult.addSink(new ChannelBrowserSink)
      }
    }
    
    2. bean
    package com.ityouxin.process.bean
    
    case class ChannelBrowser (
                                browser:String,
                                count:Long,
                                newCount:Long,
                                oldCount:Long,
                                timestamp:Long,
                                dateField:String
                              )
    package com.ityouxin.process.bean
    
    import com.alibaba.fastjson.{JSON, JSONObject}
    
    case class UserBrowse (
                            channelId:Long,
                            categoryId:Long,
                            productId:Long,
                            country:String,
                            province:String,
                            city:String,
                            network:String,
                            sources:String,
                            browserType:String,
                            entryTime:Long,
                            leaveTime:Long,
                            userId:Long
    )
    object UserBrowse{
      //转换json字符串
      def toBean(message:String)={
        val jSONObject: JSONObject = JSON.parseObject(message)
        val channelId: Long = jSONObject.get("channelId").toString.toLong
        val categoryId: Long = jSONObject.get("categoryId").toString.toLong
        val productId: Long = jSONObject.get("productId").toString.toLong
        val city: String = jSONObject.get("city").toString
        val country: String = jSONObject.get("country").toString
        val province: String = jSONObject.get("province").toString
        val network: String = jSONObject.get("network").toString
        val sources: String = jSONObject.get("sources").toString
        val browserType: String = jSONObject.get("browserType").toString
        val entryTime: Long = jSONObject.get("entryTime").toString.toLong
        val leaveTime: Long = jSONObject.get("leaveTime").toString.toLong
        val userId: Long = jSONObject.get("userId").toString.toLong
    
        UserBrowse(channelId,categoryId,productId,country,province,city,network,sources,browserType,entryTime,leaveTime,userId)
    
      }
    }
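    To make the parsing above concrete, a hypothetical Kafka record in the shape that App and UserBrowse.toBean expect might look like the following; every value is made up, and the inner message is shown as an escaped JSON string:

    {
      "count": 1,
      "timestamp": 1583932800000,
      "message": "{\"channelId\":7,\"categoryId\":12,\"productId\":305,\"country\":\"china\",\"province\":\"beijing\",\"city\":\"beijing\",\"network\":\"4G\",\"sources\":\"app\",\"browserType\":\"chrome\",\"entryTime\":1583932800000,\"leaveTime\":1583932860000,\"userId\":1001}"
    }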
    3.map
    package com.ityouxin.process.map
    
    import com.ityouxin.process.bean.{ChannelBrowser, Message, UserState}
    import com.ityouxin.tools.TimeUtils
    import org.apache.flink.api.common.functions.FlatMapFunction
    import org.apache.flink.util.Collector
    
    class ChannelBrowserMap extends FlatMapFunction[Message,ChannelBrowser]{
      override def flatMap(value: Message, out: Collector[ChannelBrowser]): Unit = {
        val timestamp = value.userBrowse.entryTime
        val hour = TimeUtils.getDate(timestamp,"yyyyMMddHH")
        val day = TimeUtils.getDate(timestamp,"yyyyMMdd")
        val month = TimeUtils.getDate(timestamp,"yyyyMM")
    
        val userID = value.userBrowse.userId
        val browser= value.userBrowse.browserType
    
        val state: UserState = UserState.getUserState(userID.toString, timestamp)
        val isNew = state.isNew
        val isFirstHour = state.isFirstHour
        val isFirstDay = state.isFirstDay
        val isFirstMonth =state.isFirstMonth
    
        val count:Long = 1L
        var newCount =0L
        var hourOldCount =0L
        var dayOldCount =0L
        var monthOldCount =0L
    
        if(isNew){
          newCount =1L
        }else{
          if(isFirstHour){
            hourOldCount=1L
          }
          if(isFirstDay){
            dayOldCount=1L
          }
          if(isFirstMonth){
            monthOldCount=1L
          }
        }
        out.collect(ChannelBrowser(browser,count,newCount,hourOldCount,timestamp,hour))
        out.collect(ChannelBrowser(browser,count,newCount,dayOldCount,timestamp,day))
        out.collect(ChannelBrowser(browser,count,newCount,monthOldCount,timestamp,month))
      }
    }
    4.reduce
    package com.ityouxin.process.reduce
    
    import com.ityouxin.process.bean.ChannelBrowser
    import org.apache.flink.api.common.functions.ReduceFunction
    
    class ChannelBrowserReduce extends ReduceFunction[ChannelBrowser]{
      override def reduce(value1: ChannelBrowser, value2: ChannelBrowser): ChannelBrowser = {
        ChannelBrowser(value1.browser,
          value1.count+value2.count,
          value1.newCount+value2.newCount,
          value1.oldCount+value2.oldCount,
          value1.timestamp,
          value1.dateField
        )
      }
    }
    5.sink
    package com.ityouxin.process.sink
    
    import com.ityouxin.process.bean.ChannelBrowser
    import com.ityouxin.tools.HbaseUtils
    import org.apache.commons.lang.StringUtils
    import org.apache.flink.streaming.api.functions.sink.SinkFunction
    import org.apache.hadoop.hbase.TableName
    
    import scala.collection.mutable
    
    class ChannelBrowserSink extends SinkFunction[ChannelBrowser] {
      override def invoke(value: ChannelBrowser, context: SinkFunction.Context[_]): Unit = {
        val browser= value.browser
    
        val tableName = TableName.valueOf("device")
        val columnFamily="info"
        val nwCount = browser+"_Count"
        val nwNewCount = browser+"_NewCount"
        val nwOldCount = browser+"_OldCount"
        val rowKey = value.dateField
    
        //获取历史数据
        val countData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwCount)
        val newCountData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwNewCount)
        val oldCountData = HbaseUtils.getTablec(tableName,rowKey,columnFamily,nwOldCount)
    
        var count = value.count
        var newCount = value.newCount
        var oldCount = value.oldCount
    
        if(StringUtils.isNotBlank(countData)){
          count+=countData.toLong
        }
    
        if(StringUtils.isNotBlank(newCountData)){
          newCount+=newCountData.toLong
        }
    
        if(StringUtils.isNotBlank(oldCountData)){
          oldCount+=oldCountData.toLong
        }
    
        val map = new mutable.HashMap[String,String]()
        map.put(nwCount,count.toString)
        map.put(nwNewCount,newCount.toString)
        map.put(nwOldCount,oldCount.toString)
    
        HbaseUtils.putData(tableName,rowKey,columnFamily,map)
    
      }
    }
    
  10. Utility classes

    The HbaseUtils class

    package com.ityouxin.tools
    
    import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory, Get, Put, Result, Table}
    import org.apache.hadoop.hbase.util.Bytes
    
    import scala.collection.mutable
    object HbaseUtils {
    
    
      //创建Hbase的配置conf
    private val conf:Configuration = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum",GlobalConfigUtils.zookeeperQuorum)
      conf.set("hbase.master",GlobalConfigUtils.hbaseMaster)
      conf.set("hbase.zookeeper.property.clientPort",GlobalConfigUtils.clientPort)
      conf.set("hbase.rpc.timeout",GlobalConfigUtils.rpcTimeout)
      conf.set("hbase.client.operator.timeout",GlobalConfigUtils.operatorTimeout)
      conf.set("hbase.client.scanner.timeout.period",GlobalConfigUtils.timeoutPeriod)
      //得到hbase的连接
      private val connection: Connection = ConnectionFactory.createConnection(conf)
      //得到admin
      private val admin:Admin = connection.getAdmin
    
      //创建或者获取表
      def createTable(tableName:TableName,columnFamily:String)={
        //获取表名
        val tableDescriptor = new HTableDescriptor(tableName)
    //获取字段列族
        val columnDescriptor = new HColumnDescriptor(columnFamily)
        tableDescriptor.addFamily(columnDescriptor)
        //判断表是否存在
        if (!admin.tableExists(tableName)){
          //如果表不存在则创建
          admin.createTable(tableDescriptor)
        }
        connection.getTable(tableName)
      }
      //获取表的数据
      def getTablec(tableName: TableName,rowKey:String,columnFamily:String,column:String)={
        var str:String=""
        val table= createTable(tableName,columnFamily)
        try{
          //rowKey转换为字节Bytes
          val get:Get = new Get(Bytes.toBytes(rowKey))
          //得到表的结果集
          val result: Result = table.get(get)
          //将列族和列添加到结果集中
          val bytes: Array[Byte] = result.getValue(Bytes.toBytes(columnFamily), Bytes.toBytes(column))
          //判断是否为空
          if(bytes!=null && bytes.size>0){
            str = Bytes.toString(bytes)
          }
        }catch {
          case e:Exception =>e.printStackTrace()
        }finally {
          table.close()
        }
        str
      }
      //写数据到hbase
      def  putData(tableName:TableName,rowKey:String,columnFamily:String,fieldsData:mutable.HashMap[String,String]): Unit ={
    //创建表
        val table: Table = createTable(tableName,columnFamily)
        try{
    
          val put = new Put(Bytes.toBytes(rowKey))
          if(fieldsData!=null && fieldsData.nonEmpty){
            for( (k,v) <- fieldsData  ){
              put.addColumn(Bytes.toBytes(columnFamily),Bytes.toBytes(k),Bytes.toBytes(v))
            }
            table.put(put)
          }
        }catch {
          case e:Exception => e.printStackTrace()
        }finally {
          table.close()
        }
    
      }
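      //left-pads source with '0' up to len characters, e.g. lpad("7", 3) == "007",
      //so row keys like "007:2020031210" keep a fixed-width channel prefix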
      def lpad(source:String,len:Int):String={
        var str = source
        if(str!=null){
          val strBuilder= new mutable.StringBuilder(str)
          while (str.length<len){
            str = strBuilder.insert(0,"0").toString()
          }
        }
        str
      }
    }
    
    

    The TimeUtils class

    package com.ityouxin.tools
    
    import java.util.Date
    import org.apache.commons.lang.time.FastDateFormat
    
    object TimeUtils {
      def getDate(timestamp:Long,format:String):String={
        val time = new Date(timestamp)
        val dateFormat: FastDateFormat = FastDateFormat.getInstance(format)
        dateFormat.format(time)
      }
    }
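    A quick usage check of the three dimension formats used by the map functions above (the printed values depend on the current time and JVM time zone):

    import com.ityouxin.tools.TimeUtils

    object TimeUtilsExample {
      def main(args: Array[String]): Unit = {
        val ts = System.currentTimeMillis()
        println(TimeUtils.getDate(ts, "yyyyMMddHH")) //hour dimension,  e.g. 2020031210
        println(TimeUtils.getDate(ts, "yyyyMMdd"))   //day dimension,   e.g. 20200312
        println(TimeUtils.getDate(ts, "yyyyMM"))     //month dimension, e.g. 202003
      }
    }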
    
    

    The GlobalConfigUtils class

    package com.ityouxin.tools
    import com.typesafe.config.ConfigFactory
    object GlobalConfigUtils {
      //load application.conf from the classpath with ConfigFactory
      private val conf = ConfigFactory.load()
      //expose the individual configuration entries
      def bootstrapServers:String = conf.getString("bootstrap.servers")
      def zookeeperConnect:String = conf.getString("zookeeper.connect")
      def inputTopic:String = conf.getString("input.topic")
      def groupId:String = conf.getString("group.id")
      def autoCommit:String = conf.getString("enable.auto.commit")
      def autoCommitInterval:String = conf.getString("auto.commit.interval.ms")
      def autoOffsetReset:String = conf.getString("auto.offset.reset")
      def zookeeperQuorum:String = conf.getString("hbase.zookeeper.quorum")
      def hbaseMaster:String = conf.getString("hbase.master")
      def clientPort:String = conf.getString("hbase.zookeeper.property.clientPort")
      def rpcTimeout:String = conf.getString("hbase.rpc.timeout")
      def operatorTimeout:String = conf.getString("hbase.client.operation.timeout")
      def timeoutPeriod:String = conf.getString("hbase.client.scanner.timeout.period")
      def maxDelayTime:Long = conf.getString("maxDelayTime").toLong
      def timeWindow:Long = conf.getString("timeWindow").toLong
    }
    
    