Project dependencies
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-common</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-mapreduce</artifactId>
    <version>2.4.0</version>
</dependency>
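hbase-client and hbase-common cover the plain client calls in cases 1-3; hbase-mapreduce provides the TableInputFormat and TableMapReduceUtil classes used in case 4. HBaseConfiguration.create() in the examples reads hbase-site.xml from the classpath; when that file is not packaged with the job, the ZooKeeper quorum can be set in code. A minimal sketch, with placeholder host names and the default client port:

import org.apache.hadoop.hbase.HBaseConfiguration

val hbaseConf = HBaseConfiguration.create()
// Only needed when hbase-site.xml is not on the classpath; hosts are placeholders
hbaseConf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")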
Case 1. Query HBase by matching a single rowkey prefix
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

object HbaseByRowkey {
  def main(args: Array[String]): Unit = {
    val tableName = "tableName"
    // Reads hbase-site.xml from the classpath
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(tableName))

    // Scan only base_info:column_name for rowkeys starting with the given prefix
    val scan = new Scan()
    scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("column_name"))
    val aidR = "0111258355840909024"
    scan.setRowPrefixFilter(Bytes.toBytes(aidR))

    val res = table.getScanner(scan)
    import scala.collection.JavaConverters._
    for (re <- res.asScala) {
      val rowkey = Bytes.toString(re.getRow)
      re.rawCells().foreach { cell =>
        val cf = Bytes.toString(CellUtil.cloneFamily(cell))
        val name = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = CellUtil.cloneValue(cell)
        println((rowkey, cf, name, value))
      }
    }
    res.close()
    table.close()
    conn.close()
  }
}
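Note that setRowPrefixFilter does not install a server-side filter: it converts the prefix into start/stop rows, so the scan only reads the matching key range. A roughly equivalent explicit form is sketched below, assuming the prefix has no trailing 0xFF byte (the real client handles that edge case):

// Explicit range scan equivalent to setRowPrefixFilter (sketch; assumes no trailing 0xFF)
val prefix = Bytes.toBytes("0111258355840909024")
val stopRow = prefix.clone()
stopRow(stopRow.length - 1) = (stopRow(stopRow.length - 1) + 1).toByte // first key past the prefix range
scan.withStartRow(prefix).withStopRow(stopRow)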
Case 2. Query HBase by matching multiple rowkey prefixes
import org.apache.hadoop.hbase.{CellUtil, CompareOperator, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.filter.{BinaryPrefixComparator, FilterList, RowFilter}
import org.apache.hadoop.hbase.util.Bytes

object HbaseByRowkey {
  def main(args: Array[String]): Unit = {
    val tableName = "tableName"
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(tableName))

    val scan = new Scan()
    scan.setCaching(1000) // rows fetched per RPC round trip
    scan.setBatch(100)    // max cells per Result, for wide rows
    scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("column_name"))

    // MUST_PASS_ONE: a row passes if any filter in the list matches (logical OR);
    // MUST_PASS_ALL: a row passes only if every filter matches (logical AND).
    // A more seek-friendly MultiRowRangeFilter alternative is sketched after this case.
    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ONE)
    /*
     * (1) Comparison operators (CompareOperator):
     *     EQUAL            equal
     *     GREATER          greater than
     *     GREATER_OR_EQUAL greater than or equal
     *     LESS             less than
     *     LESS_OR_EQUAL    less than or equal
     *     NOT_EQUAL        not equal
     *
     * (2) Comparators (ByteArrayComparable subclasses):
     *     BinaryComparator       matches the complete byte array
     *     BinaryPrefixComparator matches a byte-array prefix
     *     BitComparator          bitwise (AND/OR/XOR) comparison
     *     NullComparator         matches absent (null) values
     *     RegexStringComparator  regular-expression match
     *     SubstringComparator    substring match
     */
    val filter1 = new RowFilter(CompareOperator.EQUAL, new BinaryPrefixComparator(Bytes.toBytes("0111258355840909024")))
    val filter2 = new RowFilter(CompareOperator.EQUAL, new BinaryPrefixComparator(Bytes.toBytes("0025047552855909024")))
    filterList.addFilter(filter1)
    filterList.addFilter(filter2)
    scan.setFilter(filterList)

    val res = table.getScanner(scan)
    import scala.collection.JavaConverters._
    for (re <- res.asScala) {
      val rowkey = Bytes.toString(re.getRow)
      re.rawCells().foreach { cell =>
        val cf = Bytes.toString(CellUtil.cloneFamily(cell))
        val name = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = CellUtil.cloneValue(cell)
        println((rowkey, cf, name, value))
      }
    }
    res.close()
    table.close()
    conn.close()
  }
}
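A MUST_PASS_ONE list of RowFilters still evaluates every row the scan reads; with many prefixes it effectively becomes a full-table scan. MultiRowRangeFilter instead lets the region server seek directly between key ranges. A sketch reusing the scan from above, with stop rows derived as in the note under case 1 (assumes no trailing 0xFF bytes in the prefixes):

import org.apache.hadoop.hbase.filter.MultiRowRangeFilter
import org.apache.hadoop.hbase.filter.MultiRowRangeFilter.RowRange
import scala.collection.JavaConverters._

// Build a [prefix, nextKey) range per rowkey prefix
def prefixRange(prefix: String): RowRange = {
  val start = Bytes.toBytes(prefix)
  val stop = start.clone()
  stop(stop.length - 1) = (stop(stop.length - 1) + 1).toByte
  new RowRange(start, true, stop, false)
}

val ranges = List(prefixRange("0111258355840909024"), prefixRange("0025047552855909024"))
scan.setFilter(new MultiRowRangeFilter(ranges.asJava))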
Case 3. Fetch all columns for a list of rowkeys
val tableName = "dim_bigdata_event_face_person_5073"
val hbaseConf = HBaseConfiguration.create()
val conn = ConnectionFactory.createConnection(hbaseConf)
val table = conn.getTable(TableName.valueOf(tableName))
import scala.collection.JavaConversions._
val aid1 = new Get(Bytes.toBytes("0111004209090668762497030"))
val aid2 = new Get(Bytes.toBytes("0111004209090668829605896"))
val gets: List[Get] = List(aid1, aid2)
val res = table.get(gets)
// aid,thumbnail_id,feature_info as feature
for (re <- res) {
val cells = re.rawCells()
val row = re.getRow
val rowkey = Bytes.toString(row)
// print(rowkey)
val resultList = cells.map(cell => {
val cf = Bytes.toString(CellUtil.cloneFamily(cell))
val name = Bytes.toString(CellUtil.cloneQualifier(cell))
if (name.equalsIgnoreCase("quality_info")) {
val value = Bytes.toFloat(CellUtil.cloneValue(cell))
(name, value)
} else if (name.equalsIgnoreCase("aid") || name.equalsIgnoreCase("thumbnail_id")) {
val value1 = Bytes.toString(CellUtil.cloneValue(cell))
(name, value1)
} else if (name.equalsIgnoreCase("feature_info")) {
val value1 = CellUtil.cloneValue(cell)
(name, value1)
} else {
null
}
}).filter(_ != null).foreach(println(_))
}
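When the qualifiers are known up front, Result.getValue is simpler than iterating rawCells(). A sketch for two of the columns above, assuming they live in the base_info family as in cases 1 and 2 (getValue returns null for absent cells, hence the Option wrapping):

for (re <- res if !re.isEmpty) {
  val fam = Bytes.toBytes("base_info") // assumed family
  val aid = Option(re.getValue(fam, Bytes.toBytes("aid"))).map(b => Bytes.toString(b))
  val quality = Option(re.getValue(fam, Bytes.toBytes("quality_info"))).map(b => Bytes.toFloat(b))
  println((Bytes.toString(re.getRow), aid, quality))
}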
Case 4. Filter data by time range
import java.text.SimpleDateFormat
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.functions.lit

// spark, sourceArchiveTable, startTime and endTime come from the surrounding job
// (see the scaffold sketched after this case). No client Connection is needed here;
// TableInputFormat manages its own connections.
val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, sourceArchiveTable)

val scan = new Scan()
scan.setCaching(1000)
scan.setCacheBlocks(false) // don't pollute the block cache with a one-off scan
scan.setBatch(100)
scan.addColumn(Bytes.toBytes("base_info"), Bytes.toBytes("person_name"))

// Keep only cells whose timestamps fall in [startTime, endTime)
val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val st = df.parse(startTime).getTime
val dt = df.parse(endTime).getTime
scan.setTimeRange(st, dt)

conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))
val hbaseRDD = spark.sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat],
  classOf[ImmutableBytesWritable], classOf[Result])
val count = hbaseRDD.count()
println("==========> partitions : " + hbaseRDD.getNumPartitions)
println("==========> count : " + count)

import spark.implicits._
val resultDF = if (count == 0) {
  spark.emptyDataFrame
    .withColumn("archive_id", lit(null).cast("string"))
    .withColumn("name", lit(null).cast("string"))
} else {
  hbaseRDD.map { row =>
    val key = Bytes.toString(row._1.get())
    // Rowkeys are assumed to be stored reversed (an anti-hotspotting trick),
    // so reverse back to recover the archive id
    val archiveId = key.reverse
    // Only base_info:person_name was selected, so the first cell holds the name
    val name = row._2.rawCells().map(cell => Bytes.toString(CellUtil.cloneValue(cell))).head
    (archiveId, name)
  }.filter(_._2.nonEmpty)
    .toDF("archive_id", "name")
}
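Case 4 is a fragment: spark, sourceArchiveTable, startTime and endTime are defined elsewhere in the job. A minimal scaffold with assumed names and placeholder values, just to make the snippet runnable:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HbaseTimeRangeScan") // app name is a placeholder
  .getOrCreate()

// Placeholder table and time window; the real job presumably takes these as arguments
val sourceArchiveTable = "tableName"
val startTime = "2021-01-01 00:00:00"
val endTime = "2021-01-02 00:00:00"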