1. Reading an HBase table and writing the results to HDFS
The task: count how many times each class appears. It is essentially another word-count example, except that the input is read from an HBase table and the output is written to HDFS.
The changes required are that the map side extends a different base class, and the Driver wires up the Mapper information a little differently.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MRHbase01 {
/**
* public abstract class TableMapper<KEYOUT, VALUEOUT> extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>
* TableMapper fixes the Mapper's input key-value types, so we only need to declare the output types
*/
public static class ReadHbaseMapper extends TableMapper<Text, IntWritable> {
@Override
protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException{
//the row key is the student id; it is not needed for this count
String id = Bytes.toString(key.get());
//since we are counting class occurrences, read the clazz column from the Result
String clazz = Bytes.toString(value.getValue("info".getBytes(), "clazz".getBytes()));
context.write(new Text(clazz),new IntWritable(1));
}
}
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//load the HBase client configuration (hbase-site.xml if present on the classpath)
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum","master:2181,node1:2181,node2:2181");
Job job = Job.getInstance(conf);
job.setJarByClass(MRHbase01.class);
job.setReducerClass(MyReducer.class);
//note that setting the Mapper class here differs from a plain MapReduce job:
//TableMapReduceUtil configures the map side (a reducer-side equivalent exists too)
//and sets the Mapper class and its output k-v types in one call, which a plain MR job does separately
TableMapReduceUtil.initTableMapperJob(
TableName.valueOf("student"),
new Scan(),
ReadHbaseMapper.class,
Text.class,
IntWritable.class,
job
);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//delete the output path first if it already exists, otherwise the job fails
Path outputPath = new Path("/output");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
FileOutputFormat.setOutputPath(job, outputPath);
//submit the job and wait for it to finish
job.waitForCompletion(true);
}
}
Once the program is written it needs to be packaged and shipped to the cluster, but first we have to add a plugin to the pom file so that the HBase dependencies are bundled into the jar:
<!-- plugin for building a jar that bundles all dependencies -->
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
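With the plugin in place, build the project with the standard Maven package goal; the assembly plugin runs during the package phase:
mvn clean package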
Packaging produces two jars; upload them both, but run the larger one (the jar-with-dependencies).
Run command: hadoop jar <jar name (the larger one)> <class name (if the class is in a package, include the package prefix, or the class will not be found)>
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar MRHbase01
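For example, if the class were declared in a package (com.example is a hypothetical package name here):
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar com.example.MRHbase01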
Output:
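To inspect the result files on HDFS, assuming the default part-file naming in the /output directory used above:
hdfs dfs -cat /output/part-r-*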
Addendum:
When I wrote this, possibly because of my IDEA version, the generated override of TableMapper's map method had a Context parameter without the enclosing class name in front of it; the latest IDEA version generates it correctly.
We can also add it by hand: Ctrl-click TableMapper to jump into its source and see the Mapper<> it extends.
Copy Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> out,
then adjust it to your own input and output types.
public abstract class TableMapper<KEYOUT, VALUEOUT> extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> {
public TableMapper() {
}
}
2. Reading data from HBase and writing the results back to HBase
Since the results are written into HBase, the preparation step is to create an empty table to receive them:
create 'clazz_count','info'
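If you prefer to create the table from code rather than the HBase shell, here is a minimal sketch, assuming the HBase 2.x client API (the class name CreateClazzCount is just for illustration):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
public class CreateClazzCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        //try-with-resources closes the connection and admin automatically
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName name = TableName.valueOf("clazz_count");
            //only create the table if it does not already exist
            if (!admin.tableExists(name)) {
                admin.createTable(TableDescriptorBuilder.newBuilder(name)
                        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
                        .build());
            }
        }
    }
}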
The implementation code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MR06Hbase {
/**
* Reads the HBase student table with MapReduce,
* counts the number of students in each class, and saves the result back to HBase
*/
public static class ReadHBaseMapper extends TableMapper<Text, IntWritable> {
@Override
protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String clazz = Bytes.toString(value.getValue("info".getBytes(), "clazz".getBytes()));
context.write(new Text(clazz),new IntWritable(1));
}
}
/**
* public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT> extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation> {
*     public TableReducer() {
*     }
* }
* TableReducer takes three type parameters: the input k-v types, which must match the
* map side's output types, plus the output key type; the fourth Reducer parameter is
* fixed to Mutation. Put is a subclass of Mutation, so we use Put as the output value;
* the output key is not needed, so NullWritable will do.
*/
public static class WriteHbaseReducer extends TableReducer<Text,IntWritable,NullWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
//Text.getBytes() returns the backing array, which can be longer than the actual
//content, so go through toString() to get exactly the row-key bytes
Put put = new Put(Bytes.toBytes(key.toString()));
put.addColumn("info".getBytes(), "counts".getBytes(), Bytes.toBytes(String.valueOf(sum)));
context.write(NullWritable.get(),put);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
Job job = Job.getInstance(conf);
job.setJobName("MR06Hbase");
job.setJarByClass(MR06Hbase.class);
//Ctrl+P in IDEA shows the parameter list, handy for filling these in
TableMapReduceUtil.initTableMapperJob(
TableName.valueOf("student"),
new Scan(),
ReadHBaseMapper.class,
Text.class,
IntWritable.class,
job
);
//note: unlike initTableMapperJob, the first parameter here is declared as String table, so passing a plain table-name string is enough
TableMapReduceUtil.initTableReducerJob(
"clazz_count",
WriteHbaseReducer.class,
job
);
job.waitForCompletion(true);
}
}
Package, upload, and run:
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar MR06Hbase
After the job succeeds, check the contents of the table in HBase.
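In the HBase shell, a scan shows one row per class with the count stored in the info:counts column:
scan 'clazz_count'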