In the previous post we operated HBase through the Java API; this time we operate HBase through MapReduce instead.
Requirement: compute word-count statistics over the data in the table below.
hbase(main):006:0> scan 'words'
ROW COLUMN+CELL
1 column=info:word, timestamp=1509345324263, value=hadoop,hdfs,mapreduce,hive,hbase
2 column=info:word, timestamp=1509345324263, value=hdfs,hive,hbase,storm,kafka
3 column=info:word, timestamp=1509345324263, value=hbase,storm,kafka,spark,mahout
The counts must be written to another table, words2, in the following format:
hbase(main):010:0> scan 'words2'
ROW COLUMN+CELL
hadoop column=info:word, timestamp=1509345777813, value=1
hbase column=info:word, timestamp=1509345777813, value=3
hdfs column=info:word, timestamp=1509345777813, value=2
hive column=info:word, timestamp=1509345777813, value=2
kafka column=info:word, timestamp=1509345777813, value=2
mahout column=info:word, timestamp=1509345777813, value=1
mapreduce column=info:word, timestamp=1509345777813, value=1
spark column=info:word, timestamp=1509345777813, value=1
storm column=info:word, timestamp=1509345777813, value=2
Note: all of the shell output above was copied after I had finished the whole exercise.
1. Create the source table 'words' with a single column family, 'info'.
Insert the data: put the comma-separated text into column 'word' of that family, one value per row, each row with a distinct row key.
2. Create the output table 'words2', also with a single column family, 'info'.
3. Run a MapReduce job against the HBase 'words' table, count the words found in 'info:word', and write each count to 'info:word' of 'words2', using the word itself as the row key.
Steps 1 and 2 are automated by the init() method in the program below; everything else is explained in the code comments.
package com.scu.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

/**
 * Word count over HBase using MapReduce.
 * @author 12706
 */
public class MrMachineHBase {
    // HBase configuration object, shared by init() and the MR job
    static Configuration config = null;
    static {
        config = HBaseConfiguration.create();
        // point the client at the ZooKeeper quorum that serves HBase
        config.set("hbase.zookeeper.quorum", "mini1,mini2,mini3");
        config.set("hbase.zookeeper.property.clientPort", "2181");
    }

    // table and column names
    public static final String TABLE_NAME = "words";
    public static final String TABLE_NAME_ = "words2";
    public static final String FAMILY = "info";
    public static final String COLUMN = "word";
    // (re)create both tables and load the sample rows into 'words'
    public static void init() {
        HTable table = null;
        HBaseAdmin admin = null;
        try {
            admin = new HBaseAdmin(config);
            // drop the tables if they already exist
            if (admin.tableExists(TABLE_NAME)) {
                admin.disableTable(TABLE_NAME);
                admin.deleteTable(TABLE_NAME);
            }
            if (admin.tableExists(TABLE_NAME_)) {
                admin.disableTable(TABLE_NAME_);
                admin.deleteTable(TABLE_NAME_);
            }
            // table descriptor for the source table
            TableName tableName = TableName.valueOf(TABLE_NAME);
            HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
            // column family descriptor, attached to the table descriptor
            HColumnDescriptor columnDescriptor = new HColumnDescriptor(FAMILY);
            tableDescriptor.addFamily(columnDescriptor);
            admin.createTable(tableDescriptor);
            // same again for the output table
            TableName tableName2 = TableName.valueOf(TABLE_NAME_);
            HTableDescriptor tableDescriptor2 = new HTableDescriptor(tableName2);
            HColumnDescriptor columnDescriptor2 = new HColumnDescriptor(FAMILY);
            tableDescriptor2.addFamily(columnDescriptor2);
            admin.createTable(tableDescriptor2);

            table = new HTable(config, TABLE_NAME);
            // buffer puts client-side instead of sending one RPC per put
            table.setAutoFlush(false);
            // write buffer size in bytes
            table.setWriteBufferSize(1000);
            // insert the three sample rows
            List<Put> putList = new ArrayList<>();
            Put put = new Put(Bytes.toBytes("1"));
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));
            putList.add(put);
            Put put2 = new Put(Bytes.toBytes("2"));
            put2.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hdfs,hive,hbase,storm,kafka"));
            putList.add(put2);
            Put put3 = new Put(Bytes.toBytes("3"));
            put3.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hbase,storm,kafka,spark,mahout"));
            putList.add(put3);
            table.put(putList);
            // flush the buffered puts to the server
            table.flushCommits();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // close both handles
            if (table != null) {
                try {
                    table.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (admin != null) {
                try {
                    admin.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    /**
     * Text: mapper output key type (the word)
     * IntWritable: mapper output value type (always 1)
     * The mapper is invoked once per row (one rowkey = one Result).
     * @author 12706
     */
    static class WordCountMapper extends TableMapper<Text, IntWritable> {
        static Text k = new Text();
        static IntWritable v = new IntWritable(1);

        // key: the rowkey; value: the Result holding that row's cells
        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Context context) throws IOException, InterruptedException {
            // read the info:word cell, e.g. "hadoop,hdfs,mapreduce,hive,hbase"
            byte[] line = value.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
            // split on commas: hadoop hdfs mapreduce hive hbase
            String[] words = Bytes.toString(line).split(",");
            // emit (word, 1) for every word on the line
            for (String w : words) {
                k.set(w);
                context.write(k, v);
            }
        }
    }
    /**
     * Text: input key type (the word)
     * IntWritable: input value type (the 1s emitted by the mapper)
     * ImmutableBytesWritable: output key type, the rowkey written to HBase
     * @author 12706
     */
    static class WordCountReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable intWritable : values) {
                count += intWritable.get();
            }
            // the word itself becomes the rowkey
            Put put = new Put(Bytes.toBytes(key.toString()));
            // store the count as a string so `scan` shows a readable value (e.g. value=2)
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN), Bytes.toBytes(String.valueOf(count)));
            // writing to HBase takes the rowkey plus the Put
            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
        }
    }
    public static void main(String[] args) throws Exception {
        // drop/create the tables and load the sample data
        init();
        // set up the job
        Job job = Job.getInstance(config);
        job.setJarByClass(MrMachineHBase.class);
        // the scan that feeds the mapper; restrict it to the one column we need
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
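        // Optional scan tuning for MR jobs; commented out because the original run
        // did not use it, but both methods exist on Scan:
        // scan.setCaching(500);        // rows fetched per RPC, fewer round-trips on a full scan
        // scan.setCacheBlocks(false);  // keep a one-off full scan from churning the block cache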
        // wire up the HBase-reading mapper: table name, scan, mapper class,
        // mapper output key/value types, and the job
        TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,
                Text.class, IntWritable.class, job);
        // wire up the HBase-writing reducer: output table name, reducer class, job
        // (no reduce output types set; they default to match the mapper's)
        TableMapReduceUtil.initTableReducerJob(TABLE_NAME_, WordCountReducer.class, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
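One caveat on the code above: HBaseAdmin, HTable, Put.add(), and flushCommits() belong to the old pre-1.0 client API and are deprecated or removed in later HBase releases. On HBase 1.x and newer the same setup goes through a Connection; a minimal sketch of the equivalent calls, assuming the same config object as above:

import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;

// one Connection per application; Admin and Table are lightweight handles
try (Connection conn = ConnectionFactory.createConnection(config);
     Admin admin = conn.getAdmin();
     Table table = conn.getTable(TableName.valueOf(TABLE_NAME))) {
    Put put = new Put(Bytes.toBytes("1"));
    // addColumn() is the replacement for the deprecated Put.add()
    put.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
            Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));
    table.put(put);
}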
Package the project into a jar and upload it to the Hadoop cluster.
Run the job:
[root@mini1 ~]# hadoop jar hbase.jar com.scu.hbase.MrMachineHBase
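A plain `hadoop jar` works here without hand-wiring the HBase jars onto the task classpath because, as far as I know, TableMapReduceUtil.initTableMapperJob() ships the HBase dependency jars with the job by default; there is an overload that makes the flag explicit:

// same call as in main(), with addDependencyJars spelled out
TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,
        Text.class, IntWritable.class, job, true);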
After the job finishes, check the results from the HBase shell:
hbase(main):011:0> list
TABLE
user1
words
words2
3 row(s) in 0.0550 seconds
=> ["user1", "words", "words2"]
hbase(main):012:0> scan 'words'
ROW COLUMN+CELL
1 column=info:word, timestamp=1509345735252, value=hadoop,hdfs,mapreduce,hive,hbase
2 column=info:word, timestamp=1509345735252, value=hdfs,hive,hbase,storm,kafka
3 column=info:word, timestamp=1509345735252, value=hbase,storm,kafka,spark,mahout
3 row(s) in 0.1610 seconds
hbase(main):013:0> scan 'words2'
ROW COLUMN+CELL
hadoop column=info:word, timestamp=1509345777813, value=1
hbase column=info:word, timestamp=1509345777813, value=3
hdfs column=info:word, timestamp=1509345777813, value=2
hive column=info:word, timestamp=1509345777813, value=2
kafka column=info:word, timestamp=1509345777813, value=2
mahout column=info:word, timestamp=1509345777813, value=1
mapreduce column=info:word, timestamp=1509345777813, value=1
spark column=info:word, timestamp=1509345777813, value=1
storm column=info:word, timestamp=1509345777813, value=2
9 row(s) in 0.0860 seconds