For a detailed explanation of how HBase bulk loading works, along with a complete example, see HBase Bulk Loading: What, Why and How.
Bulk loading via MapReduce is inefficient, and if the data preprocessing cannot be finished in a single job, it has to be broken into multiple jobs. This article describes two ways to perform HBase bulk loading with Spark.
Bulk Loading with Spark
This approach applies to HBase 1.4.x.
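Before running the job, the target table has to exist; and since HFileOutputFormat2 writes HFiles per region, pre-splitting the table lets the output spread across regions. The sketch below is an assumed setup, not part of the original example: the split points "2" and "4" are illustrative, chosen to match the sample row keys, and it uses the HBase 1.x admin API.

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes

object CreateBulkLoadTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    try {
      val descriptor = new HTableDescriptor(TableName.valueOf("bulkload-table-test"))
      descriptor.addFamily(new HColumnDescriptor("f"))
      // illustrative pre-split at "2" and "4": the sample rows "1".."5"
      // then span three regions
      val splits = Array(Bytes.toBytes("2"), Bytes.toBytes("4"))
      admin.createTable(descriptor, splits)
    } finally {
      admin.close()
      connection.close()
    }
  }
}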
import java.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkBulkLoad {
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      println("Usage: SparkBulkLoad {stagingFolder}")
      return
    }
    val tableName = "bulkload-table-test"
    val columnFamily = "f"
    val stagingFolder = args(0)
    // get table info
    val hConf = HBaseConfiguration.create()
    val connection = ConnectionFactory.createConnection(hConf)
    val table = connection.getTable(TableName.valueOf(tableName))
    val regionLocator = connection.getRegionLocator(TableName.valueOf(tableName))
    val sConf = new SparkConf().setAppName("bulkload").setMaster("local")
    sConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sConf)
    // simple data to bulk load to hbase: (rowKey, Array((family, qualifier, value)))
    val rdd = sc.parallelize(Array(
      (Bytes.toBytes("1"),
        Array((Bytes.toBytes(columnFamily), Bytes.toBytes("4"), Bytes.toBytes("1")))),
      (Bytes.toBytes("4"),
        Array((Bytes.toBytes(columnFamily), Bytes.toBytes("2"), Bytes.toBytes("2")))),
      (Bytes.toBytes("2"),
        Array((Bytes.toBytes(columnFamily), Bytes.toBytes("8"), Bytes.toBytes("3")))),
      (Bytes.toBytes("3"),
        Array((Bytes.toBytes(columnFamily), Bytes.toBytes("9"), Bytes.toBytes("4")))),
      (Bytes.toBytes("5"),
        Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
    )).repartition(2) // repartition randomly to simulate real, unsorted input
    // parse the source data (one cell per record here), then repartition by
    // region and sort within each partition by (row key, family, qualifier)
    val sortedRdd = rdd.map(line => {
      val keyFamilyQualifier = new KeyFamilyQualifier(line._1, line._2(0)._1, line._2(0)._2)
      val value = line._2(0)._3
      (keyFamilyQualifier, value)
    }).repartitionAndSortWithinPartitions(new BulkLoadPartitioner(regionLocator.getStartKeys))
    // reformat the data so that it can be saved via HFileOutputFormat2
    val hfileRdd = sortedRdd.map(line => {
      val rowKey = new ImmutableBytesWritable(line._1.getRowKey)
      val keyValue = new KeyValue(line._1.getRowKey, line._1.getFamily, line._1.getQualifier, line._2)
      (rowKey, keyValue)
    })
    // save the rdd as hfiles
    hfileRdd.saveAsNewAPIHadoopFile(
      stagingFolder,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2], hConf)
    // load the hfiles from hdfs into the table
    val loader = new LoadIncrementalHFiles(hConf)
    loader.doBulkLoad(new Path(stagingFolder), connection.getAdmin, table, regionLocator)
    sc.stop()
    connection.close()
  }
}
The KeyFamilyQualifier and BulkLoadPartitioner classes used above are defined as follows:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import javax.annotation.Nonnull;

import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableComparable;

public class KeyFamilyQualifier implements WritableComparable<KeyFamilyQualifier> {
    private byte[] rowKey;
    private byte[] family;
    private byte[] qualifier;

    // no-arg constructor required for Writable deserialization
    public KeyFamilyQualifier() {}

    public KeyFamilyQualifier(byte[] rowKey, byte[] family, byte[] qualifier) {
        this.rowKey = rowKey;
        this.family = family;
        this.qualifier = qualifier;
    }

    // order by row key, then family, then qualifier, matching the cell
    // ordering HFileOutputFormat2 expects
    @Override
    public int compareTo(@Nonnull KeyFamilyQualifier other) {
        int rowCmp = Bytes.compareTo(this.rowKey, other.rowKey);
        if (rowCmp != 0) {
            return rowCmp;
        }
        int familyCmp = Bytes.compareTo(this.family, other.family);
        if (familyCmp != 0) {
            return familyCmp;
        }
        return Bytes.compareTo(this.qualifier, other.qualifier);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(rowKey.length); dataOutput.write(rowKey);
        dataOutput.writeInt(family.length); dataOutput.write(family);
        dataOutput.writeInt(qualifier.length); dataOutput.write(qualifier);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        int rowKeyLength = dataInput.readInt();
        this.rowKey = new byte[rowKeyLength];
        dataInput.readFully(this.rowKey, 0, rowKeyLength);
        int familyLength = dataInput.readInt();
        this.family = new byte[familyLength];
        dataInput.readFully(this.family, 0, familyLength);
        int qualifierLength = dataInput.readInt();
        this.qualifier = new byte[qualifierLength];
        dataInput.readFully(this.qualifier, 0, qualifierLength);
    }

    public byte[] getRowKey() {
        return rowKey;
    }

    public byte[] getFamily() {
        return family;
    }

    public byte[] getQualifier() {
        return qualifier;
    }
}
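As a quick, illustrative sanity check of the ordering contract (cells sorted by row key, then family, then qualifier), the snippet below exercises compareTo; the object name is arbitrary:

import org.apache.hadoop.hbase.util.Bytes

object KeyFamilyQualifierOrderingCheck {
  def main(args: Array[String]): Unit = {
    val a = new KeyFamilyQualifier(Bytes.toBytes("1"), Bytes.toBytes("f"), Bytes.toBytes("a"))
    val b = new KeyFamilyQualifier(Bytes.toBytes("1"), Bytes.toBytes("f"), Bytes.toBytes("b"))
    val c = new KeyFamilyQualifier(Bytes.toBytes("2"), Bytes.toBytes("f"), Bytes.toBytes("a"))
    assert(a.compareTo(b) < 0) // same row and family: the qualifier breaks the tie
    assert(b.compareTo(c) < 0) // different rows: the row key dominates
    assert(a.compareTo(a) == 0)
    println("ordering checks passed")
  }
}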
class BulkLoadPartitioner(startKeys: Array[Array[Byte]])
  extends Partitioner {
  // when the table does not exist, startKeys is empty (Byte[0][])
  override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length

  override def getPartition(key: Any): Int = {
    // built per call so the partitioner itself stays serializable
    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }
    val rowKey: Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.getRowKey
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }
    var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) {
      // on a miss, binarySearch returns -(insertionPoint) - 1; the row belongs
      // to the region whose start key immediately precedes the insertion point
      partition = -partition - 2
    }
    if (partition < 0) {
      partition = 0
    }
    partition
  }
}
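To make the binary-search arithmetic concrete, here is an illustrative run of getPartition against assumed region start keys (an empty first key plus splits at "2" and "4", matching the pre-split sketch above): rows "2" and "4" hit their region start keys exactly, while the misses fall back to the preceding region.

import org.apache.hadoop.hbase.util.Bytes

object BulkLoadPartitionerDemo {
  def main(args: Array[String]): Unit = {
    // three regions: [, "2"), ["2", "4"), ["4", )
    val startKeys: Array[Array[Byte]] =
      Array(Array.emptyByteArray, Bytes.toBytes("2"), Bytes.toBytes("4"))
    val partitioner = new BulkLoadPartitioner(startKeys)
    for (row <- Seq("1", "2", "3", "4", "5")) {
      println(s"row $row -> partition ${partitioner.getPartition(Bytes.toBytes(row))}")
    }
    // prints partitions 0, 1, 1, 2, 2
  }
}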
Bulk Loading with HBase Connectors
Starting with HBase 2.2.x, the HBase project ships HBase Connectors, which makes bulk loading from Spark considerably more convenient; see HBase Bulk Loading: What, Why and How for details. A Java example follows.
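Assuming an sbt build, the dependencies would look roughly as follows; the hbase-spark artifact is published by the hbase-connectors project, and the versions here are illustrative:

// build.sbt (illustrative versions; check the hbase-connectors compatibility matrix)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.8" % "provided",
  "org.apache.hbase" % "hbase-client" % "2.2.7",
  "org.apache.hbase.connectors.spark" % "hbase-spark" % "1.0.0"
)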
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.spark.FamilyHFileWriteOptions;
import org.apache.hadoop.hbase.spark.JavaHBaseContext;
import org.apache.hadoop.hbase.spark.KeyFamilyQualifier;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

final public class JavaHBaseBulkLoadExample {
    private JavaHBaseBulkLoadExample() {}

    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("JavaHBaseBulkLoadExample {outputPath}");
            return;
        }
        String tableName = "bulkload-table-test";
        String columnFamily1 = "f1";
        String columnFamily2 = "f2";
        SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkLoadExample " + tableName);
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        try {
            // each element is "rowKey,family,qualifier,value"
            List<String> list = new ArrayList<String>();
            // row1
            list.add("1," + columnFamily1 + ",b,1");
            // row3
            list.add("3," + columnFamily1 + ",a,2");
            list.add("3," + columnFamily1 + ",b,1");
            list.add("3," + columnFamily2 + ",a,1");
            // row2
            list.add("2," + columnFamily2 + ",a,3");
            list.add("2," + columnFamily2 + ",b,3");
            JavaRDD<String> rdd = jsc.parallelize(list);
            Configuration conf = HBaseConfiguration.create();
            JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);
            hbaseContext.bulkLoad(rdd, TableName.valueOf(tableName), new BulkLoadFunction(), args[0],
                new HashMap<byte[], FamilyHFileWriteOptions>(), false, HConstants.DEFAULT_MAX_FILE_SIZE);
        } finally {
            jsc.stop();
        }
    }

    public static class BulkLoadFunction
            implements Function<String, Pair<KeyFamilyQualifier, byte[]>> {
        @Override
        public Pair<KeyFamilyQualifier, byte[]> call(String v1) throws Exception {
            if (v1 == null) {
                return null;
            }
            String[] strs = v1.split(",");
            if (strs.length != 4) {
                return null;
            }
            KeyFamilyQualifier kfq = new KeyFamilyQualifier(Bytes.toBytes(strs[0]),
                Bytes.toBytes(strs[1]), Bytes.toBytes(strs[2]));
            return new Pair<>(kfq, Bytes.toBytes(strs[3]));
        }
    }
}
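Note that hbaseContext.bulkLoad only writes HFiles under {outputPath}; they still have to be loaded into the table afterwards. Per the notes in the upstream example, this can be done with the bundled tool, e.g. hbase org.apache.hadoop.hbase.tool.LoadIncrementalHFiles {outputPath} bulkload-table-test.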