业务场景:Flink 从 HBase 中抽取增量数据。当 IP_COPYRIGHT_SOFTWARE_LIST 产生一条新记录时,清洗出 COMPANY_ID、IP_SOFTWARE_ID,外连接 IP_COPYRIGHT_SOFTWARE,
判断 USE_FLAG 是否为 0:不为 0 则不累加;每次用 COMPANY_ID 去 IP_COPYRIGHT_SOFTWARE_LIST 表里统计 IP_SOFTWARE_ID。
对于该业务场景,目前只先实现其全量数据的计算。
自定义source
package com.dl.hbase
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.RichSourceFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.hadoop.hbase.client.{
Connection, ConnectionFactory, Scan, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{
Cell, HBaseConfiguration, HConstants, KeyValue, TableName}
import scala.collection.JavaConverters._
class HBaseReaderFromIP_COPYRIGHT_SOFTWARE_List extends RichSourceFunction[(String, String)] with Serializable{
/***
 * HBase client state, initialised in open() and held for the lifetime of the source.
 *   connection - the connection to the HBase cluster
 *   table      - the source table (IP_COPYRIGHT_SOFTWARE_LIST)
 *   scan       - the scan used to fetch rows
 */
private var conn: Connection = null
private var table: Table = null
private var scan: Scan = null
/**
 * Builds the HBase client configuration, opens the connection, and prepares
 * a Scan over the IP_COPYRIGHT_SOFTWARE_LIST table restricted to the "info"
 * column family. Called once by the Flink runtime before run().
 *
 * @param parameters Flink runtime configuration (not used here)
 */
override def open(parameters: Configuration): Unit = {
  // ZooKeeper quorum/port used by the HBase client to locate the cluster.
  val hbaseConf: org.apache.hadoop.conf.Configuration = HBaseConfiguration.create()
  hbaseConf.set(HConstants.ZOOKEEPER_QUORUM, "192.168.248.132")
  hbaseConf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
  /*
   * Optional client timeouts, kept for reference:
   *   HBASE_CLIENT_OPERATION_TIMEOUT       - operation timeout
   *   HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD  - scanner timeout
   *
   * hbaseConf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 30000)
   * hbaseConf.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 30000)
   */
  conn = ConnectionFactory.createConnection(hbaseConf)
  table = conn.getTable(TableName.valueOf("IP_COPYRIGHT_SOFTWARE_LIST"))
  scan = new Scan()
  scan.addFamily(Bytes.toBytes("info"))
}
/**
 * run 方法定义在 Java 接口 SourceFunction 中;用 IDEA 的 Ctrl + O 无法便捷地生成该方法,
 * 需要手动写出 override,IDE 随后会给出签名提示。
 * @param sourceContext 用于向下游发射 (rowKey, value) 数据的上下文
 */
override def run(sourceContext: SourceContext[(String, String)])