In the previous section, the MapReduce WordCount example read its input file from HDFS and wrote its results back to HDFS.
In a distributed MapReduce computation, the input and output can be read from or written to either HDFS or HBase as needed, for example:
A. read HDFS --> write HDFS
B. read HDFS --> write HBase
C. read HBase --> write HDFS
D. read HBase --> write HBase
This section demonstrates the third and fourth cases.
I. The fourth case: read from HBase and, after the MapReduce computation, write back to HBase. Here the MapReduce job does no real computation; it simply writes what it reads back to the output table.
The program was found online, then modified and debugged until it ran; it is recorded here for future reference.
Create a "Map/Reduce Project". Note that the jars under hbaseXXX/lib must be added to the project's referenced libraries.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
public class RwHbasetoHbase {
    // mapper class: read each row from the source table and re-emit it as a Put
    public static class readMapper extends TableMapper<Text, Put> {
        @Override
        public void map(ImmutableBytesWritable row, Result columns,
                Context context) throws IOException, InterruptedException {
            Text mapoutputkey = new Text();
            String rowkey = Bytes.toString(row.get()); // output key: the row key
            mapoutputkey.set(rowkey);
            Put put = new Put(row.get()); // output value
            for (Cell cell : columns.rawCells()) { // keep only the info column family
                if ("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                    if ("name".equals(Bytes.toString(CellUtil // column name
                            .cloneQualifier(cell)))) {
                        put.add(cell);
                    }
                    if ("age".equals(Bytes.toString(CellUtil // column age
                            .cloneQualifier(cell)))) {
                        put.add(cell);
                    }
                    if ("class".equals(Bytes.toString(CellUtil // column class
                            .cloneQualifier(cell)))) {
                        put.add(cell);
                    }
                }
            }
            context.write(mapoutputkey, put);
        }
    }

    // reducer class: write each Put straight into the output table
    public static class writereducer extends
            TableReducer<Text, Put, ImmutableBytesWritable> {
        protected void reduce(Text key, Iterable<Put> value, Context context)
                throws IOException, InterruptedException {
            for (Put put : value) {
                context.write(null, put); // TableOutputFormat ignores the key
            }
        }
    }

    // driver
    public static void main(String[] args) throws Exception {
        Configuration cf = new Configuration();
        cf.set("hbase.zookeeper.quorum", "centos7");
        Configuration config = HBaseConfiguration.create(cf);
        Job job = Job.getInstance(config, "hbase_read2write");
        job.setJarByClass(RwHbasetoHbase.class); // class that contains mapper and reducer
        Scan scan = new Scan();
        scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        TableMapReduceUtil.initTableMapperJob( // set mapper
                "student", // input table
                scan, // Scan instance to control CF and attribute selection
                readMapper.class, // mapper class
                Text.class, // mapper output key
                Put.class, // mapper output value
                job);
        TableMapReduceUtil.initTableReducerJob( // set reducer
                "student_copy", // output table
                writereducer.class, // reducer class
                job);
        job.setNumReduceTasks(1); // at least one, adjust as required
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The creation of the input table student is described in my previous article:
https://blog.csdn.net/oLinBSoft/article/details/84337229 (Learning notes: learning big data from scratch - 7. HBase Java programming hello world)
The output is written to the student_copy table, which can be scanned to confirm the copy.
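The following is a minimal verification sketch (not part of the original article), assuming the HBase 1.x client API and the same 'centos7' ZooKeeper quorum used above:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
public class ScanStudentCopy {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "centos7");
        try (Connection conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(TableName.valueOf("student_copy"));
                ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                for (Cell cell : result.rawCells()) { // print rowkey, column and value of every cell
                    System.out.println(Bytes.toString(CellUtil.cloneRow(cell)) + " "
                            + Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
                            + Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
                            + Bytes.toString(CellUtil.cloneValue(cell)));
                }
            }
        }
    }
}
Each printed cell should match the corresponding cell in student, since the job copies the info:name, info:age and info:class columns unchanged.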
II. The third case: read data from HBase as the input source, and write the MapReduce results to an HDFS file.
This is also an example found online: it reads words from the hello table in HBase, counts and sorts them, and writes the three most frequent words to the HDFS file system.
1. Create the input table 'hello'.
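The table can be created in the hbase shell; as an alternative, here is a minimal Java sketch (not from the original article) that creates the hello table and loads a couple of lines of text, using the HBase 1.x client API. The column family content, the qualifier line, the row keys and the sample sentences are all assumptions for illustration; the mapper in step 2 only requires that the first cell of each row hold a line of space-separated words.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
public class CreateHelloTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "centos7");
        try (Connection conn = ConnectionFactory.createConnection(conf);
                Admin admin = conn.getAdmin()) {
            TableName name = TableName.valueOf("hello");
            if (!admin.tableExists(name)) { // create the table with one column family
                HTableDescriptor desc = new HTableDescriptor(name);
                desc.addFamily(new HColumnDescriptor("content")); // family name is an assumption
                admin.createTable(desc);
            }
            try (Table table = conn.getTable(name)) {
                // each row holds one line of text in a single cell; the mapper splits it on spaces
                Put p1 = new Put(Bytes.toBytes("row1"));
                p1.addColumn(Bytes.toBytes("content"), Bytes.toBytes("line"),
                        Bytes.toBytes("hello world hello hbase"));
                Put p2 = new Put(Bytes.toBytes("row2"));
                p2.addColumn(Bytes.toBytes("content"), Bytes.toBytes("line"),
                        Bytes.toBytes("hello hadoop world"));
                table.put(p1);
                table.put(p2);
            }
        }
    }
}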
2. As above, add a class WordCountfromHbase.java to the project.
import java.io.IOException;
import java.util.Comparator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCountfromHbase {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String tablename = "hello";
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "centos7");
        Job job = Job.getInstance(conf, "WordCountHbaseReader");
        job.setJarByClass(WordCountfromHbase.class);
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob(tablename, scan, doMapper.class, Text.class, IntWritable.class, job);
        job.setReducerClass(WordCountHbaseReaderReduce.class);
        FileOutputFormat.setOutputPath(job, new Path(args[0])); // HDFS output directory, passed as the first argument
        MultipleOutputs.addNamedOutput(job, "hdfs", TextOutputFormat.class, WritableComparable.class, Writable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // mapper: split the first cell of each row on spaces and emit (word, 1)
    public static class doMapper extends TableMapper<Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Context context) throws IOException, InterruptedException {
            /* without splitting, take the whole row value as one word:
            String rowValue = Bytes.toString(CellUtil.cloneValue(value.listCells().get(0)));
            context.write(new Text(rowValue), one);
            */
            String[] rowValue = Bytes.toString(CellUtil.cloneValue(value.listCells().get(0))).split(" ");
            for (String str : rowValue) {
                word.set(str);
                context.write(word, one);
            }
        }
    }

    public static final int K = 3;

    // reducer: sum the counts, keep only the K most frequent words in a TreeMap, and write them out in cleanup()
    public static class WordCountHbaseReaderReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // a TreeMap holds the counts; TreeMap sorts keys in ascending order, so a reverse Comparator is supplied to sort descending
        private TreeMap<Integer, String> treeMap = new TreeMap<Integer, String>(new Comparator<Integer>() {
            @Override
            public int compare(Integer x, Integer y) {
                return y.compareTo(x);
            }
        });

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // put the reduced result into the TreeMap instead of writing it to the context
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            if (treeMap.containsKey(sum)) {
                // words with the same count share one entry, joined by commas
                String value = treeMap.get(sum) + "," + key.toString();
                treeMap.put(sum, value);
            } else {
                treeMap.put(sum, key.toString());
            }
            if (treeMap.size() > K) {
                treeMap.remove(treeMap.lastKey()); // drop the lowest count once more than K entries are held
            }
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            // write the TreeMap contents to the context as (words, count) pairs
            for (Integer key : treeMap.keySet()) {
                context.write(new Text(treeMap.get(key)), new IntWritable(key));
            }
        }
    }
}
3. Create a new run configuration and fill in the required argument, namely the HDFS output directory for the MapReduce results.
4. After the job runs, check the files in the HDFS output directory to verify the results.
Because the map function splits words only on spaces, commas attached to words also appear in the tokenized results.
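If the punctuation is unwanted, one possible adjustment (not part of the original example) is to split on runs of non-word characters instead of a single space; the same split("\\W+") call could replace split(" ") in doMapper.map(). A small standalone sketch:
public class SplitDemo {
    public static void main(String[] args) {
        String line = "hello,world hello hbase.";
        // \W+ matches any run of non-word characters, so commas and periods are dropped
        for (String word : line.split("\\W+")) {
            System.out.println(word); // prints: hello world hello hbase
        }
    }
}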