HBase Bulk Loading

The MapReduce job below reads a CSV file from HDFS, turns each record into KeyValues, writes them out as HFiles with HFileOutputFormat2, and finally loads those HFiles into an existing HBase table with LoadIncrementalHFiles.
package com.shujia.bulkloading;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class BulkLoadingDemo {
/**
 * Map output type requirements:
 * 1. The key is the RowKey, so it must be a byte-based type (ImmutableBytesWritable) and the RowKeys must be sortable.
 */
public static class BulkLoadingMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue>.Context context) throws IOException, InterruptedException {
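// Assumed layout of /data/DIANXIN.csv: comma-separated, fields 0 and 1 together form the RowKey,
// field 2 is the county, fields 3 and 4 are the x/y coordinates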
String[] line = value.toString().split(",");
String rowKey = line[0] + line[1];
String county = line[2];
String x = line[3];
String y = line[4];
/**
* KeyValue(final byte [] row,final byte [] family,
* final byte [] qualifier,final byte [] value)
*/
KeyValue county_kv = new KeyValue(
        Bytes.toBytes(rowKey),
        Bytes.toBytes("info"),
        Bytes.toBytes("county"),
        Bytes.toBytes(county)
);
KeyValue x_kv = new KeyValue(
        Bytes.toBytes(rowKey),
        Bytes.toBytes("info"),
        Bytes.toBytes("x"),
        Bytes.toBytes(x)
);
KeyValue y_kv = new KeyValue(
        Bytes.toBytes(rowKey),
        Bytes.toBytes("info"),
        Bytes.toBytes("y"),
        Bytes.toBytes(y)
);
context.write(new ImmutableBytesWritable(Bytes.toBytes(rowKey)),county_kv);
context.write(new ImmutableBytesWritable(Bytes.toBytes(rowKey)),x_kv);
context.write(new ImmutableBytesWritable(Bytes.toBytes(rowKey)),y_kv);
}
}
public static void main(String[] args) throws Exception {
// Basic configuration
Configuration conf = new Configuration();
conf.set("hbase.zookeeper.quorum","node1:2181,node2:2181,master:2181");
Job job = Job.getInstance(conf);
job.setJobName("BulkLoadingDemo");
job.setJarByClass(BulkLoadingDemo.class);
// Set the Mapper (the sort Reducer is configured further down)
job.setMapperClass(BulkLoadingMapper.class);
// Partitioner that keeps RowKeys globally ordered across partitions
job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
// Input and output paths (the output directory must not exist before the job runs)
TextInputFormat.addInputPath(job,new Path("/data/DIANXIN.csv"));
FileOutputFormat.setOutputPath(job,new Path("/data/dianxin_bulk/"));
// A sort Reducer is required: KeyValues must be written to the HFiles in globally sorted RowKey order
job.setReducerClass(KeyValueSortReducer.class);
Connection hbaseConn = ConnectionFactory.createConnection(conf);
RegionLocator regionLocator = hbaseConn.getRegionLocator(TableName.valueOf("dianxin_bulk"));
// The HBase table must be created in advance, e.g. in the HBase shell: create 'dianxin_bulk','info'
Table dianxin_bulk = hbaseConn.getTable(TableName.valueOf("dianxin_bulk"));
HTableDescriptor tableDescriptor = dianxin_bulk.getTableDescriptor();
// Configure the job to write its output as HFiles
HFileOutputFormat2.configureIncrementalLoad(job,tableDescriptor,regionLocator);
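// Note: configureIncrementalLoad also sets HFileOutputFormat2 as the output format, installs a sorting
// Reducer that matches the map output value type, and builds a TotalOrderPartitioner from the table's
// current region boundaries so that the generated HFiles line up with the existing regions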
if (!job.waitForCompletion(true)) {
    System.exit(1);
}
// Load the generated HFiles into the table
LoadIncrementalHFiles incrementalHFiles = new LoadIncrementalHFiles(conf);
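// doBulkLoad assigns each HFile to the region whose key range covers it (splitting files that span
// regions) and moves it into that region's store, making the data immediately visible to reads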
incrementalHFiles.doBulkLoad(new Path("/data/dianxin_bulk/"), hbaseConn.getAdmin(), dianxin_bulk, regionLocator);
hbaseConn.close();
}
}
Package the job as a jar, upload it to the cluster, and run it.
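A minimal sketch of that step, assuming a Maven build; the jar name, node name and local paths below are assumptions, not part of the original project:

# Build the jar and copy it to a cluster node
mvn clean package
scp target/hbase-bulkloading.jar master:/root/

# Put the input data on HDFS and create the target table with one column family
hdfs dfs -put DIANXIN.csv /data/
echo "create 'dianxin_bulk','info'" | hbase shell

# Remove any old output directory, add the HBase jars to the classpath if the jar is not shaded, then run the job
hdfs dfs -rm -r -f /data/dianxin_bulk
export HADOOP_CLASSPATH=$(hbase classpath)
hadoop jar hbase-bulkloading.jar com.shujia.bulkloading.BulkLoadingDemo

After the job finishes, a scan 'dianxin_bulk' in the HBase shell should show the loaded rows.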