综合分析平台-CSDN博客

本文链接：https://blog.csdn.net/yiyiqi123/article/details/106066447

package com.sdg.consumer.myhbase

import java.text.SimpleDateFormat
import java.util

import com.sdg.consumer.myutils.{ConnectionInstance, HBaseUtil, PropertiesUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes

/**
 * Writes call-log records into the HBase call-log table.
 *
 * When the object is first initialised it reads the region count, namespace and
 * table name from `PropertiesUtil` and, if the table does not exist yet, creates
 * the namespace and then the pre-split table with column families "f1" and "f2".
 */
object HbaseDao {

  // Timestamp format of the raw record ("2017-08-14 13:38:31").
  // NOTE: SimpleDateFormat is not thread-safe; put() is assumed to be called from one thread.
  private val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  // Compact timestamp format embedded in the row key ("20170814133831").
  private val sdf2 = new SimpleDateFormat("yyyyMMddHHmmss")
  // Pending Puts; must be a java.util.List because that is what HTable.put accepts.
  private val cacheList = new util.ArrayList[Put]
  // Number of cached Puts that triggers a flush. With 1 every record is written
  // immediately (the original condition `size >= 0` was always true, so this keeps
  // the same behaviour). Raising it requires an additional final flush for leftovers.
  private val flushThreshold = 1
  var conf: Configuration = HBaseConfiguration.create
  private val regions: Integer = Integer.valueOf(PropertiesUtil.getProperty("hbase.calllog.regions"))
  private val namespace: String = PropertiesUtil.getProperty("hbase.calllog.namespace")
  private val tableName: String = PropertiesUtil.getProperty("hbase.calllog.tablename")
  var table: HTable = null

  // Create the namespace first, then the table, but only when the table is missing.
  if (!HBaseUtil.isExistTable(conf, tableName)) {
    HBaseUtil.initNamespace(conf, namespace)
    HBaseUtil.createTable(conf, tableName, regions, "f1", "f2")
  }

  /**
   * Parses one raw call-log record and writes it to HBase.
   *
   * Raw record sample (comma separated, 7 fields):
   *   18468618874,魏明艳,13980337439,卫艺,2017-11-25 13:05:27,1088,1
   * Fields used: 0 = caller number, 2 = callee number, 4 = build time, 5 = duration.
   * (Fields 1 and 3 look like the caller/callee names and field 6 like a flag; they
   * are not stored. TODO confirm against the producer.)
   *
   * Row key layout: regionCode_caller_buildTime_callee_flag_duration,
   * e.g. 01_18576581848_20170814133831_17269452013_1_1761
   * Columns written to family "f1": call1, call2, build_time, build_time_ts, flag, duration.
   *
   * @param ori the raw comma-separated record
   * @throws IllegalArgumentException if the record has fewer than 6 fields
   * @throws java.text.ParseException if the build time is not "yyyy-MM-dd HH:mm:ss"
   */
  def put(ori: String): Unit = {
    val fields: Array[String] = ori.split(",")
    require(fields.length >= 6, s"Malformed call-log record, expected at least 6 fields: $ori")

    val caller: String = fields(0)
    // Field 1 is the caller's name in the sample record; the callee number is field 2.
    val callee: String = fields(2)
    val buildTime: String = fields(4)
    val duration: String = fields(5)

    // Lazily (re)acquire the table handle whenever a new batch starts.
    if (cacheList.size == 0) {
      val connection = ConnectionInstance.getConnection(conf)
      table = connection.getTable(TableName.valueOf(tableName)).asInstanceOf[HTable]
      // Buffer writes client side; they are sent on flushCommits().
      table.setAutoFlushTo(false)
      table.setWriteBufferSize(2 * 1024 * 1024)
    }

    // Region prefix that spreads rows across the pre-split regions.
    val regionCode: String = HBaseUtil.genRegionCode(caller, buildTime, regions)
    // Parse the timestamp once and derive both representations from it.
    val buildDate = sdf1.parse(buildTime)
    val buildTimeReplace: String = sdf2.format(buildDate)
    val buildTimeTs: String = String.valueOf(buildDate.getTime)
    // "1" is the flag stored for records written by this path.
    val rowkey: String = HBaseUtil.genRowKey(regionCode, caller, buildTimeReplace, callee, "1", duration)

    val family: Array[Byte] = Bytes.toBytes("f1")
    val row: Put = new Put(Bytes.toBytes(rowkey))
    row.addColumn(family, Bytes.toBytes("call1"), Bytes.toBytes(caller))
    row.addColumn(family, Bytes.toBytes("call2"), Bytes.toBytes(callee))
    row.addColumn(family, Bytes.toBytes("build_time"), Bytes.toBytes(buildTime))
    row.addColumn(family, Bytes.toBytes("build_time_ts"), Bytes.toBytes(buildTimeTs))
    row.addColumn(family, Bytes.toBytes("flag"), Bytes.toBytes("1"))
    row.addColumn(family, Bytes.toBytes("duration"), Bytes.toBytes(duration))
    cacheList.add(row)

    if (cacheList.size >= flushThreshold) {
      table.put(cacheList)
      // Push the buffered mutations to the region servers.
      table.flushCommits()
      println("插入数据成功")
      // Start a fresh batch.
      cacheList.clear()
    }
  }

}

package com.sdg.consumer.mykafka

//命名包名的时候不要冲突
import java.util

import com.sdg.consumer.myhbase.HbaseDao
import com.sdg.consumer.myutils.PropertiesUtil
import org.apache.kafka.clients.consumer.{ConsumerRecords, KafkaConsumer}

import scala.collection.JavaConversions._

/**
 * Kafka-to-HBase consumer: reads call-log records from Kafka and writes each one
 * to HBase through `HbaseDao.put`.
 */
object HbaseConsumer {

  /**
   * Subscribes to the topic configured under "kafka.topics" and forwards every
   * record value to HBase. Loops until the process is stopped or a write fails.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // HbaseDao.put("18468618874,魏明艳,13980337439,卫艺,2017-11-25 13:05:27,1088,1")
    // testHbase()

    // NOTE(review): the constructor call was garbled in the source
    // ("new KafkaConsumerString, String"); it is reconstructed here on the assumption
    // that the consumer settings live in the same properties file as "kafka.topics".
    val kafkaConsumer = new KafkaConsumer[String, String](PropertiesUtil.properties)
    try {
      // Subscribe to the configured topic.
      kafkaConsumer.subscribe(util.Arrays.asList(PropertiesUtil.getProperty("kafka.topics")))
      println("等待消费数据--------------")
      while (true) {
        // Wait up to 100 ms for new records.
        val records: ConsumerRecords[String, String] = kafkaConsumer.poll(100)
        // Iterate with the Java iterator directly instead of relying on the
        // deprecated implicit JavaConversions.
        val it = records.iterator()
        while (it.hasNext) {
          val str: String = it.next().value()
          println(str)
          HbaseDao.put(str)
        }
      }
    } finally {
      // Release the consumer if the loop is left by an exception.
      kafkaConsumer.close()
    }
  }

  /** Writes one hard-coded sample record to HBase; handy for a manual smoke test. */
  def testHbase(): Unit = {
    val str = "18468618874,魏明艳,13980337439,卫艺,2017-11-25 13:05:27,1088,1"
    HbaseDao.put(str)
  }

}

package com.sdg.consumer.myutils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

/** Hands out one shared HBase connection, creating it on demand. */
object ConnectionInstance {

  // The cached connection; null until the first request.
  private var conn: Connection = null

  /**
   * Returns the cached connection, opening a new one when none exists yet or the
   * cached one has been closed.
   *
   * @param conf configuration used when a new connection has to be created
   * @return an open HBase connection
   */
  def getConnection(conf: Configuration): Connection = {
    val reusable = conn != null && !conn.isClosed
    if (!reusable) {
      conn = ConnectionFactory.createConnection(conf)
    }
    conn
  }
}

package com.sdg.consumer.myutils
import java.text.DecimalFormat
import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, NamespaceDescriptor, TableName}

/** Helpers for row-key generation and for creating the HBase namespace and table. */
object HBaseUtil {

  /**
   * Builds a row key of the form
   * regionCode_caller_buildTime_callee_flag_duration.
   *
   * @param regionCode region prefix
   * @param caller     caller number
   * @param buildTime  call setup time
   * @param callee     callee number
   * @param flag       caller/callee marker
   * @param duration   call duration
   * @return the six parts joined with "_"
   */
  def genRowKey(regionCode: String, caller: String, buildTime: String, callee: String, flag: String, duration: String): String =
    // The fourth part is the callee; the previous code appended the caller twice.
    Seq(regionCode, caller, buildTime, callee, flag, duration).mkString("_")

  /**
   * Computes the two-digit region prefix for a record.
   *
   * The last four digits of the phone number are XOR-ed with the year and month
   * (yyyyMM) of the build time and the result is taken modulo the region count.
   *
   * @param call1     phone number; must have at least 4 characters, the last 4 numeric
   * @param buildTime build time starting with a date such as "2018-02-02"
   * @param regions   number of regions
   * @return the region index formatted with at least two digits, e.g. "05"
   */
  def genRegionCode(call1: String, buildTime: String, regions: Integer): String = {
    // Last four digits of the phone number.
    val lastPhone: String = call1.substring(call1.length - 4)
    // Strip '-', ':' and ' ' and keep yyyyMM.
    val ym: String = buildTime.replaceAll("[-: ]", "").substring(0, 6)
    // XOR scatters the values; the former extra Integer.hashCode step returned the
    // same value, so it is omitted.
    val mixed: Int = lastPhone.toInt ^ ym.toInt
    val regionCode: Int = mixed % regions.intValue
    new DecimalFormat("00").format(regionCode)
  }

  /**
   * Generates the split keys "00|", "01|", ... used to pre-split the table.
   *
   * One key is produced per value in [0, regions), returned in ascending byte order.
   *
   * @param regions number of split keys to generate
   * @return the split keys as byte arrays
   */
  def genSplitKeys(regions: Integer): Array[Array[Byte]] = {
    val count: Int = regions.intValue
    // Region prefixes are expected to stay within two digits.
    val df: DecimalFormat = new DecimalFormat("00")
    // BYTES_COMPARATOR keeps the keys in ascending byte order.
    val treeSet: util.TreeSet[Array[Byte]] = new util.TreeSet[Array[Byte]](Bytes.BYTES_COMPARATOR)
    for (i <- 0 until count) {
      treeSet.add(Bytes.toBytes(df.format(i) + "|"))
    }

    val splitKeys = new Array[Array[Byte]](count)
    val splitKeysIterator: util.Iterator[Array[Byte]] = treeSet.iterator
    var index = 0
    while (splitKeysIterator.hasNext) {
      val b: Array[Byte] = splitKeysIterator.next
      // Print the key text; printing the array itself only shows its identity hash.
      println(Bytes.toString(b))
      splitKeys(index) = b
      index = index + 1
    }
    splitKeys
  }

  /**
   * Creates a pre-split table with the given column families.
   *
   * @param conf         HBase configuration
   * @param tableName    table name
   * @param regions      number of split keys passed to `genSplitKeys`
   * @param columnFamily column families to add
   */
  def createTable(conf: Configuration, tableName: String, regions: Integer, columnFamily: String*): Unit = {
    val connection: Connection = ConnectionFactory.createConnection(conf)
    try {
      val admin: Admin = connection.getAdmin
      try {
        val htd = new HTableDescriptor(TableName.valueOf(tableName))
        for (cf <- columnFamily) {
          htd.addFamily(new HColumnDescriptor(cf))
        }
        // A coprocessor could be registered here when creating the table:
        // htd.addCoprocessor("hbase.CalleeWriteObserver")
        // Passing split keys pre-splits the table; admin.createTable(htd) alone
        // would create a single region.
        admin.createTable(htd, genSplitKeys(regions))
      } finally {
        admin.close()
      }
    } finally {
      // Close even when table creation fails so the connection is not leaked.
      connection.close()
    }
  }

  /** Manual check: prints the region code for a sample number and date. */
  def main(args: Array[String]): Unit = {
    /*  val conf: Configuration = HbaseDao.conf
      val connection: Connection = ConnectionFactory.createConnection(conf)
      val admin: Admin = connection.getAdmin
      //if (isExistTable(conf, tableName)) return
      val htd = new HTableDescriptor(TableName.valueOf(" "))
      // add a coprocessor
      //htd.addCoprocessor("hbase.CalleeWriteObserver")

      // create the table pre-split
      admin.createTable(htd, genSplitKeys(6))
      admin.close()
      connection.close()*/

    /*val array = genSplitKeys(6)
    println(Bytes.toString(array(0)))
    println(Bytes.toString(array(1)))
    println(Bytes.toString(array(2)))
    println(Bytes.toString(array(3)))
    println(Bytes.toString(array(4)))
    println(Bytes.toString(array(5)))
    println(Bytes.toString(array(6)))*/

    val str: String = genRegionCode("13526949099", "2018-02-02", 6)
    println(str)
  }

  /**
   * Creates the namespace if it does not exist yet.
   *
   * @param conf      HBase configuration
   * @param namespace namespace name
   */
  def initNamespace(conf: Configuration, namespace: String): Unit = {
    val connection: Connection = ConnectionFactory.createConnection(conf)
    try {
      val admin: Admin = connection.getAdmin
      try {
        // createNamespace fails on an existing namespace, which would happen when the
        // namespace is present but the table is not; skip creation in that case.
        val exists = admin.listNamespaceDescriptors().exists(_.getName == namespace)
        if (!exists) {
          val nd: NamespaceDescriptor = NamespaceDescriptor.create(namespace)
            .addConfiguration("CREATE_TIME", String.valueOf(System.currentTimeMillis))
            .addConfiguration("AUTHOR", "liuhe")
            .build
          admin.createNamespace(nd)
        }
      } finally {
        admin.close()
      }
    } finally {
      connection.close()
    }
  }

  /**
   * Checks whether a table exists.
   *
   * @param conf      HBase configuration
   * @param tableName table name
   * @return true if the table exists
   */
  def isExistTable(conf: Configuration, tableName: String): Boolean = {
    val connection: Connection = ConnectionFactory.createConnection(conf)
    try {
      val admin: Admin = connection.getAdmin
      try {
        admin.tableExists(TableName.valueOf(tableName))
      } finally {
        admin.close()
      }
    } finally {
      connection.close()
    }
  }

}

package com.sdg.consumer.myutils

import java.io.InputStream
import java.util.Properties

/** Loads hbase_consumer.properties from the classpath and exposes its values. */
object PropertiesUtil {

  val is: InputStream = ClassLoader.getSystemResourceAsStream("hbase_consumer.properties")
  var properties: Properties = new Properties

  // Fail with a clear message instead of a bare NullPointerException from load().
  require(is != null, "hbase_consumer.properties not found on the classpath")
  // Close the stream once the properties have been read.
  try properties.load(is) finally is.close()

  /**
   * Looks up a configuration value.
   *
   * @param key property name
   * @return the value, or null when the key is not present
   */
  def getProperty(key: String): String = properties.getProperty(key)

}