Learning Spark: notes on reading and writing HBase through the SparkSQL DataFrame API.
Spark provides the newAPIHadoopRDD() method for reading from HBase and the saveAsNewAPIHadoopDataset() method for writing to it.
1. Reading HBase data
Two scenarios for reading HBase data are covered here.
The first applies when only the HBase table name is known; the data is read into a DataFrame with the following schema:
StructType(List(
  StructField("HBaseRowKey", StringType, false),
  StructField("columns", MapType(StringType, StringType, true), true)
))
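A DataFrame with this schema is produced by the first branch of the implementation further down. For reference, individual cells can then be pulled out of the columns map; a minimal sketch, assuming df holds that DataFrame and that a family:qualifier named info:name actually exists in the table (both names are made-up examples):
//Hypothetical query against the map-typed DataFrame from the first scenario.
//"info:name" is an assumed family:qualifier; replace it with a real one.
val namesDF = df.select(
  df("HBaseRowKey"),
  df("columns")("info:name").alias("name")
)
namesDF.show(10, truncate = false)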
In the second scenario the data type of each cell column is known. Here the columns to read are specified explicitly (adapt this to your own business case), and a JavaBean is defined to hold the information for each column:
public class FieldPropertyBean {
    //the specified column name, in "family:qualifier" form
    private String key;
    //the data type the column was stored as
    private String value;
    public String getKey() { return key; }
    public void setKey(String key) { this.key = key; }
    public String getValue() { return value; }
    public void setValue(String value) { this.value = value; }
}
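For illustration only, the column configuration that ends up in the fieldList used below might be built like this (the family:qualifier names and types are made-up examples, not part of any real configuration):
//Hypothetical configuration: each entry maps a "family:qualifier" to the type it was stored as.
val nameField = new FieldPropertyBean()
nameField.setKey("info:name")   //assumed column, for illustration
nameField.setValue("String")
val ageField = new FieldPropertyBean()
ageField.setKey("info:age")     //assumed column, for illustration
ageField.setValue("Integer")
val columnConfig: java.util.List[FieldPropertyBean] = java.util.Arrays.asList(nameField, ageField)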
From the specified columns and the data types used when the data was written, the corresponding DataFrame schema is generated. The full implementation is as follows:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._
import scala.collection.JavaConverters._

def readHBaseData(spark: SparkSession): DataFrame = {
  //connection and scan settings (hard-coded here for the example)
  val zk_quorum: String = "192.168.1.5"
  val zk_client_port: String = "2181"
  val hb_table: String = "table-001"
  val start_row: String = "10"
  val stop_row: String = "100"
  //create the HBase configuration and apply the settings
  val hbaseConfig: Configuration = HBaseConfiguration.create()
  hbaseConfig.set("hbase.zookeeper.quorum", zk_quorum)
  hbaseConfig.set("hbase.zookeeper.property.clientPort", zk_client_port)
  hbaseConfig.set(TableInputFormat.INPUT_TABLE, hb_table)
  if (!start_row.isEmpty) {
    hbaseConfig.set(TableInputFormat.SCAN_ROW_START, start_row)
  }
  if (!stop_row.isEmpty) {
    hbaseConfig.set(TableInputFormat.SCAN_ROW_STOP, stop_row)
  }
  //build the schema information for the columns that were specified
  var structFieldList: List[StructField] = List() //collects the schema fields
  //isSet_column and this.conf.column come from the enclosing class's configuration
  //(whether specific columns to read were configured, and which ones)
  if (isSet_column) {
    //the columns to read, taken from the configuration
    val fieldList: List[FieldPropertyBean] = this.conf.column.asScala.toList
    var columnSet: Set[String] = Set() //collects the family:qualifier names
    fieldList.foreach((x: FieldPropertyBean) => {
      val column: String = x.getKey()
      columnSet += column
      val columnType: String = x.getValue()
      //map the configured type name to the corresponding Spark SQL DataType
      val fieldType: DataType = columnType match {
        case "String" => StringType
        case "Integer" => IntegerType
        case "Long" => LongType
        case "Double" => DoubleType
        case "Float" => FloatType
        case _ => StringType
      }
      val structField: StructField = StructField(column, fieldType, nullable = true)
      structFieldList :+= structField
    })
    hbaseConfig.set(TableInputFormat.SCAN_COLUMNS, columnSet.mkString(" "))
  }
  //run the scan and load the data as an RDD of (row key, Result)
  val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] =
    spark.sparkContext.newAPIHadoopRDD(hbaseConfig,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
  //cache the RDD
  hbaseRDD.cache()
  //convert to a DataFrame (row key plus a map of "family:qualifier" -> value)
  if (structFieldList.isEmpty) {
    //no columns were specified: read everything into a Map[String, String] column
    import spark.implicits._
    val df = hbaseRDD.mapPartitions { item =>
      item.map {
        case (_, result: Result) =>
          val rowKey: String = Bytes.toString(result.getRow)
          val families = result.getNoVersionMap.keySet().iterator()
          var columns = Map[String, String]()
          while (families.hasNext) {
            val family = families.next()
            val qualifiers = result.getFamilyMap(family).keySet().iterator()
            while (qualifiers.hasNext) {
              val qualifier = qualifiers.next()
              columns += ((Bytes.toString(family) + ":" + Bytes.toString(qualifier)) ->
                Bytes.toString(result.getValue(family, qualifier)))
            }
          }
          (rowKey, columns)
      }
    }.toDF("HBaseRowKey", "columns")
    df
  } else {
    //columns were specified: build typed rows that match the generated schema
    val schema: StructType = StructType(structFieldList)
    val rddRow: RDD[Row] = hbaseRDD.mapPartitions { part =>
      part.map {
        case (_, result: Result) =>
          var rowSeq: Seq[Any] = Seq[Any]()
          val rowKey: String = Bytes.toString(result.getRow)
          rowSeq :+= rowKey
          schema.foreach(item => {
            val name: String = item.name
            if (name.contains(":")) {
              val family: String = name.split(":")(0)
              val qualifier: String = name.split(":")(1)
              val tempValue: Array[Byte] = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier))
              //getValue returns null when the cell is absent; append null in that case so
              //the row stays aligned with the schema instead of shifting or throwing
              if (tempValue != null && tempValue.nonEmpty) {
                val value = item.dataType.typeName match {
                  case "string" => Bytes.toString(tempValue)
                  case "integer" => Bytes.toInt(tempValue)
                  case "long" => Bytes.toLong(tempValue)
                  case "double" => Bytes.toDouble(tempValue)
                  case "float" => Bytes.toFloat(tempValue)
                  case _ => Bytes.toString(tempValue)
                }
                rowSeq :+= value
              } else {
                rowSeq :+= null
              }
            } else {
              //a configured column without "family:qualifier" form still needs a slot
              rowSeq :+= null
            }
          })
          Row.fromSeq(rowSeq)
      }
    }
    //prepend the row-key field so the final schema matches the rows built above
    val rowKey: StructField = StructField("HBaseRowKey", StringType, nullable = false)
    val newSchema: StructType = StructType(rowKey +: schema.fields)
    spark.createDataFrame(rddRow, newSchema)
  }
}
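A minimal calling sketch (the SparkSession settings are placeholders, not from the original code):
val spark = SparkSession.builder()
  .appName("hbase-read-example") //assumed application name
  .master("local[*]")            //assumed master, for local testing only
  .getOrCreate()
val hbaseDF = readHBaseData(spark)
hbaseDF.printSchema()
hbaseDF.show(10, truncate = false)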
2. Writing to HBase: to follow in a later post...
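Until that post, here is a minimal, untested sketch of the write path using the saveAsNewAPIHadoopDataset() method mentioned at the top (reusing the imports from the read example). The target table, the column family "cf", the presence of a string HBaseRowKey column, and storing every value as a string are assumptions for illustration, not the eventual implementation:
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job

def writeHBaseData(df: DataFrame): Unit = {
  val hbaseConfig = HBaseConfiguration.create()
  hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.5")
  hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181")
  hbaseConfig.set(TableOutputFormat.OUTPUT_TABLE, "table-001") //assumed target table

  val job = Job.getInstance(hbaseConfig)
  job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

  //capture the column names on the driver so the closure below stays serializable
  val valueColumns = df.schema.fieldNames.filterNot(_ == "HBaseRowKey")

  //turn every row into (row key, Put); all values are written as strings under family "cf"
  val putsRDD = df.rdd.map { row =>
    val rowKeyBytes = Bytes.toBytes(row.getAs[String]("HBaseRowKey"))
    val put = new Put(rowKeyBytes)
    valueColumns.foreach { name =>
      val v = row.getAs[Any](name)
      if (v != null) {
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes(name), Bytes.toBytes(v.toString))
      }
    }
    (new ImmutableBytesWritable(rowKeyBytes), put)
  }
  putsRDD.saveAsNewAPIHadoopDataset(job.getConfiguration)
}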