Project dependencies
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-common</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-mapreduce</artifactId>
    <version>2.4.0</version>
</dependency>
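hbase-client and hbase-common cover the plain client calls in cases 1-3; hbase-mapreduce provides the TableInputFormat and TableMapReduceUtil classes used in case 4. HBaseConfiguration.create() in the examples reads hbase-site.xml from the classpath; when that file is not packaged with the job, the ZooKeeper quorum can be set in code. A minimal sketch, with placeholder host names and the default client port:

import org.apache.hadoop.hbase.HBaseConfiguration

val hbaseConf = HBaseConfiguration.create()
// Only needed when hbase-site.xml is not on the classpath; hosts are placeholders
hbaseConf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")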
Case 1. Query HBase by matching a single rowkey prefix
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

object HbaseByRowkey {
  def main(args: Array[String]): Unit = {
    val tableName = "tableName"
    // Reads hbase-site.xml from the classpath
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(tableName))

    // Scan only base_info:column_name for rowkeys starting with the given prefix
    val scan = new Scan()
    scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("column_name"))
    val aidR = "0111258355840909024"
    scan.setRowPrefixFilter(Bytes.toBytes(aidR))

    val res = table.getScanner(scan)
    import scala.collection.JavaConverters._
    for (re <- res.asScala) {
      val rowkey = Bytes.toString(re.getRow)
      re.rawCells().foreach { cell =>
        val cf = Bytes.toString(CellUtil.cloneFamily(cell))
        val name = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = CellUtil.cloneValue(cell)
        println((rowkey, cf, name, value))
      }
    }
    res.close()
    table.close()
    conn.close()
  }
}
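Note that setRowPrefixFilter does not install a server-side filter: it converts the prefix into start/stop rows, so the scan only reads the matching key range. A roughly equivalent explicit form is sketched below, assuming the prefix has no trailing 0xFF byte (the real client handles that edge case):

// Explicit range scan equivalent to setRowPrefixFilter (sketch; assumes no trailing 0xFF)
val prefix = Bytes.toBytes("0111258355840909024")
val stopRow = prefix.clone()
stopRow(stopRow.length - 1) = (stopRow(stopRow.length - 1) + 1).toByte // first key past the prefix range
scan.withStartRow(prefix).withStopRow(stopRow)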
Case 2. Query HBase by matching multiple rowkey prefixes
import org.apache.hadoop.hbase.{CellUtil, CompareOperator, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.filter.{BinaryPrefixComparator, FilterList, RowFilter}
import org.apache.hadoop.hbase.util.Bytes

object HbaseByRowkey {
  def main(args: Array[String]): Unit = {
    val tableName = "tableName"
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(tableName))

    val scan = new Scan()
    scan.setCaching(1000) // rows fetched per RPC round trip
    scan.setBatch(100)    // max cells per Result, for wide rows
    scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("column_name"))

    // MUST_PASS_ONE: a row passes if any filter in the list matches (logical OR);
    // MUST_PASS_ALL: a row passes only if every filter matches (logical AND).
    // A more seek-friendly MultiRowRangeFilter alternative is sketched after this case.
    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ONE)
    /*
     * (1) Comparison operators (CompareOperator):
     *     EQUAL            equal
     *     GREATER          greater than
     *     GREATER_OR_EQUAL greater than or equal
     *     LESS             less than
     *     LESS_OR_EQUAL    less than or equal
     *     NOT_EQUAL        not equal
     *
     * (2) Comparators (ByteArrayComparable subclasses):
     *     BinaryComparator       matches the complete byte array
     *     BinaryPrefixComparator matches a byte-array prefix
     *     BitComparator          bitwise (AND/OR/XOR) comparison
     *     NullComparator         matches absent (null) values
     *     RegexStringComparator  regular-expression match
     *     SubstringComparator    substring match
     */
    val filter1 = new RowFilter(CompareOperator.EQUAL, new BinaryPrefixComparator(Bytes.toBytes("0111258355840909024")))
    val filter2 = new RowFilter(CompareOperator.EQUAL, new BinaryPrefixComparator(Bytes.toBytes("0025047552855909024")))
    filterList.addFilter(filter1)
    filterList.addFilter(filter2)
    scan.setFilter(filterList)

    val res = table.getScanner(scan)
    import scala.collection.JavaConverters._
    for (re <- res.asScala) {
      val rowkey = Bytes.toString(re.getRow)
      re.rawCells().foreach { cell =>
        val cf = Bytes.toString(CellUtil.cloneFamily(cell))
        val name = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = CellUtil.cloneValue(cell)
        println((rowkey, cf, name, value))
      }
    }
    res.close()
    table.close()
    conn.close()
  }
}
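A MUST_PASS_ONE list of RowFilters still evaluates every row the scan reads; with many prefixes it effectively becomes a full-table scan. MultiRowRangeFilter instead lets the region server seek directly between key ranges. A sketch reusing the scan from above, with stop rows derived as in the note under case 1 (assumes no trailing 0xFF bytes in the prefixes):

import org.apache.hadoop.hbase.filter.MultiRowRangeFilter
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter.RowRange
import scala.collection.JavaConverters._

// Build a [prefix, nextKey) range per rowkey prefix
def prefixRange(prefix: String): RowRange = {
  val start = Bytes.toBytes(prefix)
  val stop = start.clone()
  stop(stop.length - 1) = (stop(stop.length - 1) + 1).toByte
  new RowRange(start, true, stop, false)
}

val ranges = List(prefixRange("0111258355840909024"), prefixRange("0025047552855909024"))
scan.setFilter(new MultiRowRangeFilter(ranges.asJava))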
Case 3. Fetch all columns for a list of rowkeys
val tableName = "dim_bigdata_event_face_person_5073"
val hbaseConf = HBaseConfiguration.create()
val conn = ConnectionFactory.createConnection(hbaseConf)
val table = conn.getTable(TableName.valueOf(tableName))
import scala.collection.JavaConversions._
val aid1 = new Get(Bytes.toBytes("0111004209090668762497030"))
val aid2 = new Get(Bytes.toBytes("0111004209090668829605896"))
val gets: List[Get] = List(aid1, aid2)
val res = table.get(gets)
// aid,thumbnail_id,feature_info as feature
for (re <- res) {
val cells = re.rawCells()
val row = re.getRow
val rowkey = Bytes.toString(row)
// print(rowkey)
val resultList = cells.map(cell => {
val cf = Bytes.toString(CellUtil.cloneFamily(cell))
val name = Bytes.toString(CellUtil.cloneQualifier(cell))
if (name.equalsIgnoreCase("quality_info")) {
val value = Bytes.toFloat(CellUtil.cloneValue(cell))
(name, value)
} else if (name.equalsIgnoreCase("aid") || name.equalsIgnoreCase("thumbnail_id")) {
val value1 = Bytes.toString(CellUtil.cloneValue(cell))
(name, value1)
} else if (name.equalsIgnoreCase("feature_info")) {
val value1 = CellUtil.cloneValue(cell)
(name, value1)
} else {
null
}
}).filter(_ != null).foreach(println(_))
}
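When the qualifiers are known up front, Result.getValue is simpler than iterating rawCells(). A sketch for two of the columns above, assuming they live in the base_info family as in cases 1 and 2 (getValue returns null for absent cells, hence the Option wrapping):

for (re <- res if !re.isEmpty) {
  val fam = Bytes.toBytes("base_info") // assumed family
  val aid = Option(re.getValue(fam, Bytes.toBytes("aid"))).map(b => Bytes.toString(b))
  val quality = Option(re.getValue(fam, Bytes.toBytes("quality_info"))).map(b => Bytes.toFloat(b))
  println((Bytes.toString(re.getRow), aid, quality))
}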
Case 4. Filter data by time range
import java.text.SimpleDateFormat
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.functions.lit

// spark, sourceArchiveTable, startTime and endTime come from the surrounding job
// (see the scaffold sketched after this case). No client Connection is needed here;
// TableInputFormat manages its own connections.
val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, sourceArchiveTable)

val scan = new Scan()
scan.setCaching(1000)
scan.setCacheBlocks(false) // don't pollute the block cache with a one-off scan
scan.setBatch(100)
scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("person_name"))

// Keep only cells whose timestamps fall in [startTime, endTime)
val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val st = df.parse(startTime).getTime
val dt = df.parse(endTime).getTime
scan.setTimeRange(st, dt)

conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))
val hbaseRDD = spark.sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat],
  classOf[ImmutableBytesWritable], classOf[Result])
val count = hbaseRDD.count()
println("==========> partitions : " + hbaseRDD.getNumPartitions)
println("==========> count : " + count)

import spark.implicits._
val resultDF = if (count == 0) {
  spark.emptyDataFrame
    .withColumn("archive_id", lit(null).cast("string"))
    .withColumn("name", lit(null).cast("string"))
} else {
  hbaseRDD.map { row =>
    val key = Bytes.toString(row._1.get())
    // Rowkeys are assumed to be stored reversed (an anti-hotspotting trick),
    // so reverse back to recover the archive id
    val archiveId = key.reverse
    // Only base_info:person_name was selected, so the first cell holds the name
    val name = row._2.rawCells().map(cell => Bytes.toString(CellUtil.cloneValue(cell))).head
    (archiveId, name)
  }.filter(_._2.nonEmpty)
    .toDF("archive_id", "name")
}
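Case 4 is a fragment: spark, sourceArchiveTable, startTime and endTime are defined elsewhere in the job. A minimal scaffold with assumed names and placeholder values, just to make the snippet runnable:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HbaseTimeRangeScan") // app name is a placeholder
  .getOrCreate()

// Placeholder table and time window; the real job presumably takes these as arguments
val sourceArchiveTable = "tableName"
val startTime = "2021-01-01 00:00:00"
val endTime = "2021-01-02 00:00:00"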