1. Create the two tables, table1 and table2
create 'table2',{NAME => 'cf1',VERSIONS => 3},{NAME => 'cf2',VERSIONS => 3}
create 'table1',{NAME => 'cf',VERSIONS => 3}
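For reference, a minimal Java sketch of the same table creation through the Admin API. This assumes the HBase 1.x client (HTableDescriptor/HColumnDescriptor, deprecated in 2.x) and borrows the ZooKeeper quorum from the driver configuration shown later; it is not part of the original walkthrough:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateTables {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            // table1: one family 'cf', keeping up to 3 versions per cell
            HTableDescriptor t1 = new HTableDescriptor(TableName.valueOf("table1"));
            t1.addFamily(new HColumnDescriptor("cf").setMaxVersions(3));
            admin.createTable(t1);

            // table2: two families 'cf1' and 'cf2', also 3 versions each
            HTableDescriptor t2 = new HTableDescriptor(TableName.valueOf("table2"));
            t2.addFamily(new HColumnDescriptor("cf1").setMaxVersions(3));
            t2.addFamily(new HColumnDescriptor("cf2").setMaxVersions(3));
            admin.createTable(t2);
        }
    }
}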
2. Insert data into table1
hbase(main):026:0> put 'table1','1','cf:c1','31'
0 row(s) in 0.0320 seconds
hbase(main):027:0> put 'table1','1','cf:c1','30'
0 row(s) in 0.0100 seconds
hbase(main):028:0> put 'table1','1','cf:c2','male'
0 row(s) in 0.0080 seconds
hbase(main):029:0> put 'table1','2','cf:c1','31'
0 row(s) in 0.0080 seconds
hbase(main):030:0> put 'table1','2','cf:c2','female'
0 row(s) in 0.0090 seconds
hbase(main):031:0> put 'table1','3','cf:c1','28'
0 row(s) in 0.0110 seconds
hbase(main):032:0> put 'table1','3','cf:c2','female'
0 row(s) in 0.0170 seconds
hbase(main):033:0> put 'table1','4','cf:c1','29'
0 row(s) in 0.0090 seconds
hbase(main):034:0> put 'table1','4','cf:c2','male'
0 row(s) in 0.0140 seconds
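Because cf was created with VERSIONS => 3, the two puts to row '1', column cf:c1 are retained as two separate versions rather than the second overwriting the first. A minimal sketch to verify this from Java (HBase 1.x client API; the quorum is the assumed cluster setting used elsewhere in this article):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckVersions {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("table1"))) {
            Get get = new Get(Bytes.toBytes("1"));
            get.setMaxVersions(); // return every stored version, not just the newest
            get.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("c1"));
            Result result = table.get(get);
            for (Cell cell : result.rawCells()) {
                // Expected: "30" (newest) and "31" (older), each with its own timestamp
                System.out.println(Bytes.toString(CellUtil.cloneValue(cell))
                        + " @ " + cell.getTimestamp());
            }
        }
    }
}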
3. Use a MapReduce program to import the data from table1 into table2
The code takes six parameters: the two table names (table1 and table2), the column family (fromFamily) and qualifier (fromQualifier) to export from table1, and the column family (toFamily) and qualifier (toQualifier) to write into table2. Passing these parameters lets you choose which column to read from the source table and which column to write to in the target table. In addition, setting the maximum number of versions on the Scan makes it possible to export every version of the data in table1.
package demo;

import java.io.IOException;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class H2HMapper extends TableMapper<ImmutableBytesWritable, Put> {

    private String fromFamily = null;
    private String fromQualifier = null;
    private String toFamily = null;
    private String toQualifier = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Source and target column coordinates are passed in via the job configuration
        fromFamily = context.getConfiguration().get("fromfamily");
        fromQualifier = context.getConfiguration().get("fromqualifier");
        toFamily = context.getConfiguration().get("tofamily");
        toQualifier = context.getConfiguration().get("toqualifier");
    }

    @Override
    protected void map(ImmutableBytesWritable rowkey, Result columns, Context context)
            throws IOException, InterruptedException {
        Put put = new Put(rowkey.get());
        for (Cell kv : columns.rawCells()) {
            System.out.println(Bytes.toStringBinary(CellUtil.cloneQualifier(kv)));
            // Copy only the cells whose qualifier matches the requested source column
            if (Bytes.toString(CellUtil.cloneQualifier(kv)).equals(fromQualifier)) {
                System.out.println("++++++++++++" + fromQualifier);
                put.addColumn(Bytes.toBytes(toFamily),
                        Bytes.toBytes(toQualifier), CellUtil.cloneValue(kv));
                context.write(rowkey, put);
            }
        }
    }
}
package demo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Driver extends Configured implements Tool {

    private static final String FROMTABLE = "fromtable";
    private static final String TOTABLE = "totable";

    @Override
    public int run(String[] args) throws Exception {
        byte[] family = Bytes.toBytes(args[2]);
        byte[] qualifier = Bytes.toBytes(args[3]);

        Scan scan = new Scan();
        scan.setMaxVersions(); // without this, the scan returns only the newest version
        scan.addColumn(family, qualifier);

        Configuration conf = getConf();
        conf.set(FROMTABLE, args[0]);
        conf.set(TOTABLE, args[1]);
        conf.set("fromfamily", args[2]);
        conf.set("fromqualifier", args[3]);
        conf.set("tofamily", args[4]);
        conf.set("toqualifier", args[5]);

        String jobName = "From table " + args[0] + ", import to " + args[1];
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(Driver.class);
        job.setNumReduceTasks(0); // map-only job: the mapper emits Puts directly

        TableMapReduceUtil.initTableMapperJob(
                args[0],                      // source table
                scan,
                H2HMapper.class,
                ImmutableBytesWritable.class, // mapper output key
                Put.class,                    // mapper output value
                job);
        TableMapReduceUtil.initTableReducerJob(args[1], null, job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        args = new String[]{
                "table1", // source table
                "table2", // target table
                "cf",     // fromFamily
                "c1",     // fromQualifier
                "cf1",    // toFamily
                "c3"      // toQualifier
        };
        ToolRunner.run(getConfiguration(), new Driver(), args);
    }

    private static Configuration configuration;

    public static Configuration getConfiguration() {
        if (configuration == null) {
            configuration = new Configuration();
            configuration.setBoolean("mapreduce.app-submission.cross-platform", true); // enable cross-platform job submission
            configuration.set("fs.defaultFS", "hdfs://master:8020");                   // NameNode address
            configuration.set("mapreduce.framework.name", "yarn");                     // run on YARN
            configuration.set("yarn.resourcemanager.address", "master:8032");          // ResourceManager address
            configuration.set("yarn.resourcemanager.scheduler.address", "master:8030");// ResourceManager scheduler
            configuration.set("mapreduce.jobhistory.address", "master:10020");         // JobHistory server
            configuration.set("hbase.master", "master:16000");
            configuration.set("hbase.rootdir", "hdfs://master:8020/hbase");
            configuration.set("hbase.zookeeper.quorum", "slave1,slave2,slave3");
            configuration.set("hbase.zookeeper.property.clientPort", "2181");
            configuration.set("mapreduce.job.jar", "C:\\Users\\Administrator\\Desktop\\hbase1.jar"); // path of the job jar
        }
        return configuration;
    }
}
If the version count is not set, i.e. the scan.setMaxVersions() line is commented out, the scan exports only the newest version of each cell from table1. From the data inserted in step 2 we know that table1 holds five cells in column cf:c1 (row 1 was written twice, rows 2, 3, and 4 once each), yet the job's map output records counter is 4, which shows the program did not output every version:
Map-Reduce Framework
        Map input records=4
        Map output records=4
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=169
        CPU time spent (ms)=2560
        Physical memory (bytes) snapshot=120504320
        Virtual memory (bytes) snapshot=848523264
        Total committed heap usage (bytes)=16130048
Looking at table2 at this point, it likewise contains only four cells:
hbase(main):081:0> scan 'table2'
ROW                   COLUMN+CELL
 1                    column=cf1:c3, timestamp=1477596015451, value=30
 2                    column=cf1:c3, timestamp=1477596015451, value=31
 3                    column=cf1:c3, timestamp=1477596015451, value=28
 4                    column=cf1:c3, timestamp=1477596015451, value=29
4 row(s) in 0.0670 seconds
If the version count is set, i.e. the scan.setMaxVersions() line is kept, the scan exports every version from table1. This time the map output records counter is 5, showing that all versions were read:
Map-Reduce Framework
        Map input records=4
        Map output records=5
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=187
        CPU time spent (ms)=2500
        Physical memory (bytes) snapshot=122474496
        Virtual memory (bytes) snapshot=848461824
        Total committed heap usage (bytes)=16130048
Now let's look at table2 again:
hbase(main):086:0> scan 'table2'
ROW                   COLUMN+CELL
 1                    column=cf1:c3, timestamp=1477596557735, value=31
 2                    column=cf1:c3, timestamp=1477596557735, value=31
 3                    column=cf1:c3, timestamp=1477596557735, value=28
 4                    column=cf1:c3, timestamp=1477596557735, value=29
4 row(s) in 0.0430 seconds
Still only four cells. Why? My understanding is this: the scan output above shows that every cell carries the same timestamp, and rowkey + column + timestamp uniquely identifies a cell, so each such coordinate can store only one value. Because the Put does not carry over the source timestamps, both versions of row 1 arrive with the same server-assigned timestamp and collapse into a single cell, so table2 keeps only one of them. To make table2 receive all versions from table1, simply change
put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier), CellUtil.cloneValue(kv));
to
put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier),
        kv.getTimestamp(), CellUtil.cloneValue(kv));

so that each copied cell keeps its original timestamp and all versions are preserved.
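The timestamp collision can also be reproduced in isolation. A minimal sketch, assuming the same cluster settings and the table2 schema above (the row keys 'demo1'/'demo2' are made up for illustration): writing the same row and column twice without explicit timestamps leaves a single cell, while distinct timestamps preserve both versions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class TimestampCollapseDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("table2"))) {

            // No explicit timestamps: both cells receive the same server timestamp,
            // so they count as one version and only one value survives.
            Put collide = new Put(Bytes.toBytes("demo1"));
            collide.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), Bytes.toBytes("v1"));
            collide.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), Bytes.toBytes("v2"));
            table.put(collide);

            // Explicit, distinct timestamps: both versions are stored and a
            // scan with VERSIONS => 3 will show both values.
            Put keepBoth = new Put(Bytes.toBytes("demo2"));
            keepBoth.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), 1L, Bytes.toBytes("v1"));
            keepBoth.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), 2L, Bytes.toBytes("v2"));
            table.put(keepBoth);
        }
    }
}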