Notes on HBase with MapReduce: reading data out of HBase into HDFS.
Straight to the code; the notes are in the code comments.
Driver class: Hbase_Hdfs_Driver
package com.zk.mr_hbase_hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;

public class Hbase_Hdfs_Driver extends Configured implements Tool {

    public static void main(String[] args) {
        try {
            int exitCode = ToolRunner.run(new Hbase_Hdfs_Driver(), args);
            System.exit(exitCode);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("fs.defaultFS", "hdfs://ducking:9000/");
        conf.set("hbase.zookeeper.quorum", "ducking:2181,admin:2181,xmh:2181");
        // submit as the Hadoop root user when running from a Windows client
        System.setProperty("HADOOP_USER_NAME", "root");
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        // set the driver class so the jar containing it is shipped to the cluster
        job.setJarByClass(Hbase_Hdfs_Driver.class);

        // scan conditions are configured on the Scan object
        Scan scan = new Scan();
        // rows fetched per RPC: a larger cache makes the scanner faster but uses more memory
        scan.setCaching(50);

        // set the mapper; note that the imports come from the mapreduce package,
        // not the older mapred package
        TableMapReduceUtil.initTableMapperJob(
                "users".getBytes(),       // table name
                scan,                     // Scan describing which data to read
                Hbase_Hdfs_Mapper.class,  // mapper class
                Text.class,               // mapper output key class
                Text.class,               // mapper output value class
                job                       // the job to configure
        );

        // no reduce phase is needed: with zero reducers the map output is written straight to HDFS
        job.setNumReduceTasks(0);

        Path outputPath = new Path("hdfs://ducking:9000/hdfs_hbase/output");
        // delete the output path if it already exists
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        if (!isDone) {
            throw new IOException("Job running with error");
        }
        return isDone ? 0 : 1;
    }
}
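The Scan above reads every column of every row in "users". As a rough sketch (the "info" column family and the row-key bounds below are made-up examples, not taken from the original table), the Scan can be narrowed before it is handed to initTableMapperJob, which usually makes the job noticeably cheaper:

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class ScanExamples {
    // builds a narrowed Scan; the family name and row-key range are illustrative only
    public static Scan narrowedScan() {
        Scan scan = new Scan();
        scan.setCaching(50);
        scan.setCacheBlocks(false);                    // avoid polluting the region server block cache during a one-off MR scan
        scan.addFamily(Bytes.toBytes("info"));         // hypothetical: read only the "info" column family
        scan.setStartRow(Bytes.toBytes("user_0000"));  // hypothetical start of the row-key range (inclusive)
        scan.setStopRow(Bytes.toBytes("user_1000"));   // hypothetical end of the row-key range (exclusive)
        return scan;
    }
}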
Mapper class: Hbase_Hdfs_Mapper
package com.zk.mr_hbase_hdfs;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import java.io.IOException;

public class Hbase_Hdfs_Mapper extends TableMapper<Text, Text> {

    private Text k = new Text();
    private Text v = new Text();
    private StringBuilder sb = new StringBuilder();

    // ImmutableBytesWritable key: the row key read from HBase
    // Result value: all of that row's cells, wrapped in a Result
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        // 1. Extract the row key: convert the raw row-key bytes back to a String
        k.set(Bytes.toString(key.copyBytes()));

        // 2. Concatenate the value of every cell in this row and store it in v
        sb.setLength(0); // reset the buffer between rows
        Cell[] cells = value.rawCells(); // rawCells() returns the Cell array backing the Result
        for (Cell cell : cells) {
            // copy the cell's value bytes and convert them to a String
            String cellValue = Bytes.toString(CellUtil.cloneValue(cell));
            sb.append(cellValue).append(","); // values are separated by commas
        }
        v.set(sb.toString());

        // no computation is done here, so emit the record directly;
        // with zero reducers it is written straight to HDFS by TextOutputFormat
        context.write(k, v);
    }
}
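If only one known column is wanted in the output, the Result can be queried directly instead of walking every cell. A minimal sketch, assuming a hypothetical "info:name" family/qualifier that is not part of the original table:

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import java.io.IOException;

public class SingleColumnMapper extends TableMapper<Text, Text> {

    private final Text k = new Text();
    private final Text v = new Text();

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        // row key as a String
        k.set(Bytes.toString(key.copyBytes()));
        // fetch a single cell by family/qualifier; "info" and "name" are assumed names
        byte[] name = value.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
        if (name != null) { // the column may be missing for some rows
            v.set(Bytes.toString(name));
            context.write(k, v);
        }
    }
}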