1. Create the two tables, table1 and table2
create 'table2',{NAME => 'cf1',VERSIONS => 3},{NAME => 'cf2',VERSIONS => 3}
create 'table1',{NAME => 'cf',VERSIONS => 3}
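For reference, a minimal Java sketch of the same table creation through the Admin API. This assumes the HBase 1.x client (HTableDescriptor/HColumnDescriptor, deprecated in 2.x) and borrows the ZooKeeper quorum from the driver configuration shown later; it is not part of the original walkthrough:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateTables {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            // table1: one family 'cf', keeping up to 3 versions per cell
            HTableDescriptor t1 = new HTableDescriptor(TableName.valueOf("table1"));
            t1.addFamily(new HColumnDescriptor("cf").setMaxVersions(3));
            admin.createTable(t1);

            // table2: two families 'cf1' and 'cf2', also 3 versions each
            HTableDescriptor t2 = new HTableDescriptor(TableName.valueOf("table2"));
            t2.addFamily(new HColumnDescriptor("cf1").setMaxVersions(3));
            t2.addFamily(new HColumnDescriptor("cf2").setMaxVersions(3));
            admin.createTable(t2);
        }
    }
}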
2. Insert data into table1
hbase(main):026:0> put 'table1','1','cf:c1','31'
0 row(s) in 0.0320 seconds
hbase(main):027:0> put 'table1','1','cf:c1','30'
0 row(s) in 0.0100 seconds
hbase(main):028:0> put 'table1','1','cf:c2','male'
0 row(s) in 0.0080 seconds
hbase(main):029:0> put 'table1','2','cf:c1','31'
0 row(s) in 0.0080 seconds
hbase(main):030:0> put 'table1','2','cf:c2','female'
0 row(s) in 0.0090 seconds
hbase(main):031:0> put 'table1','3','cf:c1','28'
0 row(s) in 0.0110 seconds
hbase(main):032:0> put 'table1','3','cf:c2','female'
0 row(s) in 0.0170 seconds
hbase(main):033:0> put 'table1','4','cf:c1','29'
0 row(s) in 0.0090 seconds
hbase(main):034:0> put 'table1','4','cf:c2','male'
0 row(s) in 0.0140 seconds
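Because cf was created with VERSIONS => 3, the two puts to row '1', column cf:c1 are retained as two separate versions rather than the second overwriting the first. A minimal sketch to verify this from Java (HBase 1.x client API; the quorum is the assumed cluster setting used elsewhere in this article):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckVersions {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("table1"))) {
            Get get = new Get(Bytes.toBytes("1"));
            get.setMaxVersions(); // return every stored version, not just the newest
            get.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("c1"));
            Result result = table.get(get);
            for (Cell cell : result.rawCells()) {
                // Expected: "30" (newest) and "31" (older), each with its own timestamp
                System.out.println(Bytes.toString(CellUtil.cloneValue(cell))
                        + " @ " + cell.getTimestamp());
            }
        }
    }
}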
3. Use a MapReduce program to import the data from table1 into table2
The code takes six parameters: the two table names (table1 and table2), the column family (fromFamily) and qualifier (fromQualifier) to export from table1, and the column family (toFamily) and qualifier (toQualifier) to write into table2. Passing these parameters lets you choose which column to read from the source table and which column to write to in the target table. In addition, setting the maximum number of versions on the Scan makes it possible to export every version of the data in table1.
package demo;

import java.io.IOException;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class H2HMapper extends TableMapper<ImmutableBytesWritable, Put> {

    private String fromFamily = null;
    private String fromQualifier = null;
    private String toFamily = null;
    private String toQualifier = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Source and target column coordinates are passed in via the job configuration
        fromFamily = context.getConfiguration().get("fromfamily");
        fromQualifier = context.getConfiguration().get("fromqualifier");
        toFamily = context.getConfiguration().get("tofamily");
        toQualifier = context.getConfiguration().get("toqualifier");
    }

    @Override
    protected void map(ImmutableBytesWritable rowkey, Result columns, Context context)
            throws IOException, InterruptedException {
        Put put = new Put(rowkey.get());
        for (Cell kv : columns.rawCells()) {
            System.out.println(Bytes.toStringBinary(CellUtil.cloneQualifier(kv)));
            // Copy only the cells whose qualifier matches the requested source column
            if (Bytes.toString(CellUtil.cloneQualifier(kv)).equals(fromQualifier)) {
                System.out.println("++++++++++++" + fromQualifier);
                put.addColumn(Bytes.toBytes(toFamily),
                        Bytes.toBytes(toQualifier), CellUtil.cloneValue(kv));
                context.write(rowkey, put);
            }
        }
    }
}
package demo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Driver extends Configured implements Tool {

    private static final String FROMTABLE = "fromtable";
    private static final String TOTABLE = "totable";

    @Override
    public int run(String[] args) throws Exception {
        byte[] family = Bytes.toBytes(args[2]);
        byte[] qualifier = Bytes.toBytes(args[3]);

        Scan scan = new Scan();
        scan.setMaxVersions(); // without this, the scan returns only the newest version
        scan.addColumn(family, qualifier);

        Configuration conf = getConf();
        conf.set(FROMTABLE, args[0]);
        conf.set(TOTABLE, args[1]);
        conf.set("fromfamily", args[2]);
        conf.set("fromqualifier", args[3]);
        conf.set("tofamily", args[4]);
        conf.set("toqualifier", args[5]);

        String jobName = "From table " + args[0] + ", import to " + args[1];
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(Driver.class);
        job.setNumReduceTasks(0); // map-only job: the mapper emits Puts directly

        TableMapReduceUtil.initTableMapperJob(
                args[0],                      // source table
                scan,
                H2HMapper.class,
                ImmutableBytesWritable.class, // mapper output key
                Put.class,                    // mapper output value
                job);
        TableMapReduceUtil.initTableReducerJob(args[1], null, job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        args = new String[]{
                "table1", // source table
                "table2", // target table
                "cf",     // fromFamily
                "c1",     // fromQualifier
                "cf1",    // toFamily
                "c3"      // toQualifier
        };
        ToolRunner.run(getConfiguration(), new Driver(), args);
    }

    private static Configuration configuration;

    public static Configuration getConfiguration() {
        if (configuration == null) {
            configuration = new Configuration();
            configuration.setBoolean("mapreduce.app-submission.cross-platform", true); // enable cross-platform job submission
            configuration.set("fs.defaultFS", "hdfs://master:8020");                   // NameNode address
            configuration.set("mapreduce.framework.name", "yarn");                     // run on YARN
            configuration.set("yarn.resourcemanager.address", "master:8032");          // ResourceManager address
            configuration.set("yarn.resourcemanager.scheduler.address", "master:8030");// ResourceManager scheduler
            configuration.set("mapreduce.jobhistory.address", "master:10020");         // JobHistory server
            configuration.set("hbase.master", "master:16000");
            configuration.set("hbase.rootdir", "hdfs://master:8020/hbase");
            configuration.set("hbase.zookeeper.quorum", "slave1,slave2,slave3");
            configuration.set("hbase.zookeeper.property.clientPort", "2181");
            configuration.set("mapreduce.job.jar", "C:\\Users\\Administrator\\Desktop\\hbase1.jar"); // path of the job jar
        }
        return configuration;
    }
}
If the version count is not set, i.e. the scan.setMaxVersions() line is commented out, the scan exports only the newest version of each cell from table1. From the data inserted in step 2 we know that table1 holds five cells in column cf:c1 (row 1 was written twice, rows 2, 3, and 4 once each), yet the job's map output records counter is 4, which shows the program did not output every version:
Map-Reduce Framework
        Map input records=4
        Map output records=4
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=169
        CPU time spent (ms)=2560
        Physical memory (bytes) snapshot=120504320
        Virtual memory (bytes) snapshot=848523264
        Total committed heap usage (bytes)=16130048
Looking at table2 at this point, it likewise contains only four cells:
hbase(main):081:0> scan 'table2'
ROW                   COLUMN+CELL
 1                    column=cf1:c3, timestamp=1477596015451, value=30
 2                    column=cf1:c3, timestamp=1477596015451, value=31
 3                    column=cf1:c3, timestamp=1477596015451, value=28
 4                    column=cf1:c3, timestamp=1477596015451, value=29
4 row(s) in 0.0670 seconds
If the version count is set, i.e. the scan.setMaxVersions() line is kept, the scan exports every version from table1. This time the map output records counter is 5, showing that all versions were read:
Map-Reduce Framework
        Map input records=4
        Map output records=5
        Input split bytes=64
        Spilled Records=0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=187
        CPU time spent (ms)=2500
        Physical memory (bytes) snapshot=122474496
        Virtual memory (bytes) snapshot=848461824
        Total committed heap usage (bytes)=16130048
Now let's look at table2 again:
hbase(main):086:0> scan 'table2'
ROW                   COLUMN+CELL
 1                    column=cf1:c3, timestamp=1477596557735, value=31
 2                    column=cf1:c3, timestamp=1477596557735, value=31
 3                    column=cf1:c3, timestamp=1477596557735, value=28
 4                    column=cf1:c3, timestamp=1477596557735, value=29
4 row(s) in 0.0430 seconds
Still only four cells. Why? My understanding is this: the scan output above shows that every cell carries the same timestamp, and rowkey + column + timestamp uniquely identifies a cell, so each such coordinate can store only one value. Because the Put does not carry over the source timestamps, both versions of row 1 arrive with the same server-assigned timestamp and collapse into a single cell, so table2 keeps only one of them. To make table2 receive all versions from table1, simply change
put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier), CellUtil.cloneValue(kv));
to
put.addColumn(Bytes.toBytes(toFamily), Bytes.toBytes(toQualifier),
        kv.getTimestamp(), CellUtil.cloneValue(kv));

so that each copied cell keeps its original timestamp and all versions are preserved.
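The timestamp collision can also be reproduced in isolation. A minimal sketch, assuming the same cluster settings and the table2 schema above (the row keys 'demo1'/'demo2' are made up for illustration): writing the same row and column twice without explicit timestamps leaves a single cell, while distinct timestamps preserve both versions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class TimestampCollapseDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); // assumed quorum
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("table2"))) {

            // No explicit timestamps: both cells receive the same server timestamp,
            // so they count as one version and only one value survives.
            Put collide = new Put(Bytes.toBytes("demo1"));
            collide.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), Bytes.toBytes("v1"));
            collide.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), Bytes.toBytes("v2"));
            table.put(collide);

            // Explicit, distinct timestamps: both versions are stored and a
            // scan with VERSIONS => 3 will show both values.
            Put keepBoth = new Put(Bytes.toBytes("demo2"));
            keepBoth.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), 1L, Bytes.toBytes("v1"));
            keepBoth.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("c3"), 2L, Bytes.toBytes("v2"));
            table.put(keepBoth);
        }
    }
}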