Data preparation:
Table mktest:mk3 (excerpt of its scan output; the \x... values are UTF-8 encoded Chinese strings: the names 周二 and 郑明, and the gender 男, i.e. male):

....
95021    column=user:age, timestamp=1554208964508, value=17
95021    column=user:dept, timestamp=1554208964508, value=MA
95021    column=user:name, timestamp=1554208964508, value=\xE5\x91\xA8\xE4\xBA\x8C
95021    column=user:sex, timestamp=1554208964508, value=\xE7\x94\xB7
95022    column=user:age, timestamp=1554208964508, value=20
95022    column=user:dept, timestamp=1554208964508, value=MA
95022    column=user:name, timestamp=1554208964508, value=\xE9\x83\x91\xE6\x98\x8E
95022    column=user:sex, timestamp=1554208964508, value=\xE7\x94\xB7
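For reference, rows like these can be loaded with the standard HBase client API. A minimal sketch, assuming the same ZooKeeper quorum as the driver below; the LoadSample class and the single row it inserts are illustrative only:

package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

// Hypothetical loader: inserts one sample row into mktest:mk3.
public class LoadSample {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "mycat01:2181,mycat02:2181,mycat03:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("mktest:mk3"))) {
            Put p = new Put(Bytes.toBytes("95021"));
            // values are stored as strings, matching what the mapper parses below
            p.addColumn(Bytes.toBytes("user"), Bytes.toBytes("age"), Bytes.toBytes("17"));
            p.addColumn(Bytes.toBytes("user"), Bytes.toBytes("dept"), Bytes.toBytes("MA"));
            table.put(p);
        }
    }
}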
Requirement: group by dept and compute the average age of each department.
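To pin down the expected semantics before writing the MapReduce job, here is the same group-by-average over in-memory data with Java streams (a minimal sketch; the User class and the two sample rows are taken from the excerpt above and stand in for the HBase table):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class AvgAgeByDept {
    // hypothetical in-memory stand-in for the HBase rows
    static class User {
        final String dept; final int age;
        User(String dept, int age) { this.dept = dept; this.age = age; }
    }

    public static void main(String[] args) {
        List<User> users = Arrays.asList(new User("MA", 17), new User("MA", 20));
        // group by dept, then average age within each group
        Map<String, Double> avg = users.stream()
                .collect(Collectors.groupingBy(u -> u.dept, Collectors.averagingInt(u -> u.age)));
        System.out.println(avg); // {MA=18.5} for just these two rows
    }
}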
1. MapReduce program design
1) Map side
Define a custom class that extends TableMapper (provided by HBase). TableMapper<KEYOUT, VALUEOUT> is simply a Mapper whose input key/value types are fixed to ImmutableBytesWritable (the row key) and Result (the row's cells), so only the output types need to be declared.
package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

// dept is the output key, age is the column to aggregate
public class HBase2HdfsMapper extends TableMapper<Text, IntWritable> {

    private Text mk = new Text();
    private IntWritable mv = new IntWritable();

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        // one Result per row; walk its cells to pick out dept and age
        Cell[] cells = value.rawCells();
        for (Cell c : cells) {
            String name = Bytes.toString(c.getQualifierArray(), c.getQualifierOffset(), c.getQualifierLength());
            String val = Bytes.toString(c.getValueArray(), c.getValueOffset(), c.getValueLength());
            if (name.equals("dept")) {
                mk.set(val);
            }
            if (name.equals("age")) {
                mv.set(Integer.parseInt(val));
            }
        }
        // assumes every row carries both dept and age, so mk and mv are freshly set
        context.write(mk, mv);
    }
}
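As a side note, the array/offset/length plumbing above can be replaced by the CellUtil helpers, which copy the bytes instead of slicing the backing array. A minimal sketch of equivalent extraction helpers (the CellStrings class is hypothetical):

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical helper: same qualifier/value extraction via CellUtil,
// trading the extra byte[] copies for simpler code.
final class CellStrings {
    static String qualifier(Cell c) { return Bytes.toString(CellUtil.cloneQualifier(c)); }
    static String value(Cell c)     { return Bytes.toString(CellUtil.cloneValue(c)); }
}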
2) Reduce side
Define a custom class that extends the plain Hadoop Reducer class; since the results are written to HDFS rather than back into HBase, no HBase-specific reducer is needed. (TableMapReduceUtil, the HBase-provided API that wires HBase tables into a MapReduce job as input or output, is used later in the driver.)
package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// emits "dept<TAB>avgAge" lines; the value slot is unused
public class HBase2HdfsReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

    private Text mk = new Text();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;   // running total of ages in this dept group
        int count = 0; // number of people in the group
        for (IntWritable value : values) {
            sum += value.get();
            count++;
        }
        double avg = sum * 1.0 / count;
        mk.set(key.toString() + "\t" + Double.toString(avg));
        context.write(mk, NullWritable.get());
    }
}
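Note that this Reducer cannot double as a combiner: an average of partial averages is not the overall average, and its output types (Text, NullWritable) do not match the map output types (Text, IntWritable) anyway. If combining mattered at scale, the usual approach is to ship (sum, count) pairs and divide only in the final reduce.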
3) Driver class
package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class HBase2HdfsDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://mkmg/");
        conf.set("hbase.zookeeper.quorum", "mycat01:2181,mycat02:2181,mycat03:2181");

        Job job = Job.getInstance(conf);
        job.setJarByClass(HBase2HdfsDriver.class);
        job.setReducerClass(HBase2HdfsReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // let TableMapReduceUtil wire the HBase table up as the job input;
        // the last argument (addDependencyJars = false) assumes the HBase
        // jars are already on the task classpath
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob("mktest:mk3", scan, HBase2HdfsMapper.class,
                Text.class, IntWritable.class, job, false);

        // clear the output directory if a previous run left it behind
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path("/mktest/mk4out");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
    }
}
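One operational point not shown in the original: because initTableMapperJob is called with addDependencyJars=false, the HBase client jars must already be on the classpath when the job is submitted with hadoop jar. A common way to arrange this is to prepend the output of the hbase mapredcp command to HADOOP_CLASSPATH before submitting.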
2. Result output
[hadoop@mycat01 stu]$ hdfs dfs -cat /mktest/mk4out/part-r-00000
CS 20.0
IS 19.166666666666668
MA 19.0