Learning Spark: notes on reading and writing HBase through the SparkSQL DataFrame API.
Spark provides the newAPIHadoopRDD() method for reading from HBase and the saveAsNewAPIHadoopDataset() method for writing to it.
1. Reading HBase data
Two scenarios for reading HBase data are covered here.
The first applies when only the HBase table name is known; the data is read into a DataFrame with the following schema:
StructType(List(
  StructField("HBaseRowKey", StringType, false),
  StructField("columns", MapType(StringType, StringType, true), true)
))
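A DataFrame with this schema is produced by the first branch of the implementation further down. For reference, individual cells can then be pulled out of the columns map; a minimal sketch, assuming df holds that DataFrame and that a family:qualifier named info:name actually exists in the table (both names are made-up examples):
//Hypothetical query against the map-typed DataFrame from the first scenario.
//"info:name" is an assumed family:qualifier; replace it with a real one.
val namesDF = df.select(
  df("HBaseRowKey"),
  df("columns")("info:name").alias("name")
)
namesDF.show(10, truncate = false)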
In the second scenario the data type of each cell column is known. Here the columns to read are specified explicitly (adapt this to your own business case), and a JavaBean is defined to hold the information for each column:
public class FieldPropertyBean {
    //the specified column name, in "family:qualifier" form
    private String key;
    //the data type the column was stored as
    private String value;
    public String getKey() { return key; }
    public void setKey(String key) { this.key = key; }
    public String getValue() { return value; }
    public void setValue(String value) { this.value = value; }
}
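For illustration only, the column configuration that ends up in the fieldList used below might be built like this (the family:qualifier names and types are made-up examples, not part of any real configuration):
//Hypothetical configuration: each entry maps a "family:qualifier" to the type it was stored as.
val nameField = new FieldPropertyBean()
nameField.setKey("info:name")   //assumed column, for illustration
nameField.setValue("String")
val ageField = new FieldPropertyBean()
ageField.setKey("info:age")     //assumed column, for illustration
ageField.setValue("Integer")
val columnConfig: java.util.List[FieldPropertyBean] = java.util.Arrays.asList(nameField, ageField)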
From the specified columns and the data types used when the data was written, the corresponding DataFrame schema is generated. The full implementation is as follows:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._
import scala.collection.JavaConverters._

def readHBaseData(spark: SparkSession): DataFrame = {
  //connection and scan settings (hard-coded here for the example)
  val zk_quorum: String = "192.168.1.5"
  val zk_client_port: String = "2181"
  val hb_table: String = "table-001"
  val start_row: String = "10"
  val stop_row: String = "100"
  //create the HBase configuration and apply the settings
  val hbaseConfig: Configuration = HBaseConfiguration.create()
  hbaseConfig.set("hbase.zookeeper.quorum", zk_quorum)
  hbaseConfig.set("hbase.zookeeper.property.clientPort", zk_client_port)
  hbaseConfig.set(TableInputFormat.INPUT_TABLE, hb_table)
  if (!start_row.isEmpty) {
    hbaseConfig.set(TableInputFormat.SCAN_ROW_START, start_row)
  }
  if (!stop_row.isEmpty) {
    hbaseConfig.set(TableInputFormat.SCAN_ROW_STOP, stop_row)
  }
  //build the schema information for the columns that were specified
  var structFieldList: List[StructField] = List() //collects the schema fields
  //isSet_column and this.conf.column come from the enclosing class's configuration
  //(whether specific columns to read were configured, and which ones)
  if (isSet_column) {
    //the columns to read, taken from the configuration
    val fieldList: List[FieldPropertyBean] = this.conf.column.asScala.toList
    var columnSet: Set[String] = Set() //collects the family:qualifier names
    fieldList.foreach((x: FieldPropertyBean) => {
      val column: String = x.getKey()
      columnSet += column
      val columnType: String = x.getValue()
      //map the configured type name to the corresponding Spark SQL DataType
      val fieldType: DataType = columnType match {
        case "String" => StringType
        case "Integer" => IntegerType
        case "Long" => LongType
        case "Double" => DoubleType
        case "Float" => FloatType
        case _ => StringType
      }
      val structField: StructField = StructField(column, fieldType, nullable = true)
      structFieldList :+= structField
    })
    hbaseConfig.set(TableInputFormat.SCAN_COLUMNS, columnSet.mkString(" "))
  }
  //run the scan and load the data as an RDD of (row key, Result)
  val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] =
    spark.sparkContext.newAPIHadoopRDD(hbaseConfig,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
  //cache the RDD
  hbaseRDD.cache()
  //convert to a DataFrame (row key plus a map of "family:qualifier" -> value)
  if (structFieldList.isEmpty) {
    //no columns were specified: read everything into a Map[String, String] column
    import spark.implicits._
    val df = hbaseRDD.mapPartitions { item =>
      item.map {
        case (_, result: Result) =>
          val rowKey: String = Bytes.toString(result.getRow)
          val families = result.getNoVersionMap.keySet().iterator()
          var columns = Map[String, String]()
          while (families.hasNext) {
            val family = families.next()
            val qualifiers = result.getFamilyMap(family).keySet().iterator()
            while (qualifiers.hasNext) {
              val qualifier = qualifiers.next()
              columns += ((Bytes.toString(family) + ":" + Bytes.toString(qualifier)) ->
                Bytes.toString(result.getValue(family, qualifier)))
            }
          }
          (rowKey, columns)
      }
    }.toDF("HBaseRowKey", "columns")
    df
  } else {
    //columns were specified: build typed rows that match the generated schema
    val schema: StructType = StructType(structFieldList)
    val rddRow: RDD[Row] = hbaseRDD.mapPartitions { part =>
      part.map {
        case (_, result: Result) =>
          var rowSeq: Seq[Any] = Seq[Any]()
          val rowKey: String = Bytes.toString(result.getRow)
          rowSeq :+= rowKey
          schema.foreach(item => {
            val name: String = item.name
            if (name.contains(":")) {
              val family: String = name.split(":")(0)
              val qualifier: String = name.split(":")(1)
              val tempValue: Array[Byte] = result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier))
              //getValue returns null when the cell is absent; append null in that case so
              //the row stays aligned with the schema instead of shifting or throwing
              if (tempValue != null && tempValue.nonEmpty) {
                val value = item.dataType.typeName match {
                  case "string" => Bytes.toString(tempValue)
                  case "integer" => Bytes.toInt(tempValue)
                  case "long" => Bytes.toLong(tempValue)
                  case "double" => Bytes.toDouble(tempValue)
                  case "float" => Bytes.toFloat(tempValue)
                  case _ => Bytes.toString(tempValue)
                }
                rowSeq :+= value
              } else {
                rowSeq :+= null
              }
            } else {
              //a configured column without "family:qualifier" form still needs a slot
              rowSeq :+= null
            }
          })
          Row.fromSeq(rowSeq)
      }
    }
    //prepend the row-key field so the final schema matches the rows built above
    val rowKey: StructField = StructField("HBaseRowKey", StringType, nullable = false)
    val newSchema: StructType = StructType(rowKey +: schema.fields)
    spark.createDataFrame(rddRow, newSchema)
  }
}
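A minimal calling sketch (the SparkSession settings are placeholders, not from the original code):
val spark = SparkSession.builder()
  .appName("hbase-read-example") //assumed application name
  .master("local[*]")            //assumed master, for local testing only
  .getOrCreate()
val hbaseDF = readHBaseData(spark)
hbaseDF.printSchema()
hbaseDF.show(10, truncate = false)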
2. Writing to HBase: to follow in a later post...
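Until that post, here is a minimal, untested sketch of the write path using the saveAsNewAPIHadoopDataset() method mentioned at the top (reusing the imports from the read example). The target table, the column family "cf", the presence of a string HBaseRowKey column, and storing every value as a string are assumptions for illustration, not the eventual implementation:
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job

def writeHBaseData(df: DataFrame): Unit = {
  val hbaseConfig = HBaseConfiguration.create()
  hbaseConfig.set("hbase.zookeeper.quorum", "192.168.1.5")
  hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181")
  hbaseConfig.set(TableOutputFormat.OUTPUT_TABLE, "table-001") //assumed target table

  val job = Job.getInstance(hbaseConfig)
  job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

  //capture the column names on the driver so the closure below stays serializable
  val valueColumns = df.schema.fieldNames.filterNot(_ == "HBaseRowKey")

  //turn every row into (row key, Put); all values are written as strings under family "cf"
  val putsRDD = df.rdd.map { row =>
    val rowKeyBytes = Bytes.toBytes(row.getAs[String]("HBaseRowKey"))
    val put = new Put(rowKeyBytes)
    valueColumns.foreach { name =>
      val v = row.getAs[Any](name)
      if (v != null) {
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes(name), Bytes.toBytes(v.toString))
      }
    }
    (new ImmutableBytesWritable(rowKeyBytes), put)
  }
  putsRDD.saveAsNewAPIHadoopDataset(job.getConfiguration)
}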