HBase本身提供了很多种数据导入的方式,通常有两种常用方式:
使用HBase提供的TableOutputFormat,原理是通过一个MapReduce作业将数据导入HBase
另一种方式就是使用HBase原生Client API
本文就是示范如何通过MapReduce作业从一个文件读取数据并写入到HBase中。
首先启动Hadoop与HBase,然后创建一个空表,用于后面导入数据:
hbase(main):006:0> create 'mytable','cf'
0 row(s) in 10.8310 seconds
=> Hbase::Table - mytable
hbase(main):007:0> list
TABLE
mytable
1 row(s) in 0.1220 seconds
=> ["mytable"]
hbase(main):008:0> scan 'mytable'
ROW COLUMN+CELL
0 row(s) in 0.2130 seconds
一、示例程序
下面的示例程序通过TableOutputFormat将HDFS上具有一定格式的文本数据导入到HBase中。
首先创建MapReduce作业,目录结构如下:
Hdfs2HBase/
├── classes
└── src
    ├── Hdfs2HBase.java
    ├── Hdfs2HBaseMapper.java
    └── Hdfs2HBaseReducer.java
Hdfs2HBaseMapper.java
package com.lisong.hdfs2hbase;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Hdfs2HBaseMapper extends Mapper {
public void map(LongWritable key, Text line, Context context) throws IOException,InterruptedException {
String lineStr = line.toString();
int index = lineStr.indexOf(":");
String rowkey = lineStr.substring(0, index);
String left = lineStr.substring(index+1);
context.write(new Text(rowkey), new Text(left));
}
}
Hdfs2HBaseReducer.java
package com.lisong.hdfs2hbase;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
public class Hdfs2HBaseReducer extends Reducer {
public void reduce(Text rowkey, Iterable value, Context context) throws IOException,InterruptedException {
String k = rowkey.toString();
for(Text val : value) {
Put put = new Put(k.getBytes());
String[] strs = val.toString().split(":");
String family &#