Using MapReduce with HBase (2)

Reading data from HDFS, processing it with MapReduce, and writing the results into HBase.

1. WordCount example

package mapred;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class WordCountHBase {

	public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
		private IntWritable i = new IntWritable(1);

		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Split the line on whitespace and emit (word, 1) for each token.
			String[] s = value.toString().trim().split("\\s+");
			for (String m : s) {
				context.write(new Text(m), i);
			}
		}
	}
	
	public static class Reduce extends TableReducer<Text, IntWritable, NullWritable> {
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable i : values) {
				sum += i.get();
			}
			// Row key = the word itself; the count is stored as string bytes
			// in column family "content", qualifier "count".
			Put put = new Put(Bytes.toBytes(key.toString()));
			put.add(Bytes.toBytes("content"), Bytes.toBytes("count"),
					Bytes.toBytes(String.valueOf(sum)));
			context.write(NullWritable.get(), put);
		}
	}
	
	/**
	 * @param args
	 * @throws Exception 
	 */
	public static void main(String[] args) throws Exception {
		String tableName = "wordcount";
		Configuration config = HBaseConfiguration.create();
		config.set("hbase.zookeeper.property.clientPort", "2181");
		config.set("hbase.zookeeper.quorum", "10.10.4.55");
		// Tell TableOutputFormat which table the reducer's Puts go to.
		config.set(TableOutputFormat.OUTPUT_TABLE, tableName);
		
		HTableDescriptor htd = new HTableDescriptor(tableName);
		HColumnDescriptor col = new HColumnDescriptor("content");
		htd.addFamily(col);
		
		HBaseAdmin admin = new HBaseAdmin(config);
		if(admin.tableExists(tableName)) {
			System.out.println("table exists, trying recreate table!");
			admin.disableTable(tableName);
			admin.deleteTable(tableName);
		}
		System.out.println("create new table : " + tableName);
		admin.createTable(htd);
		admin.close();
		
		String input = "hdfs://10.10.4.55:9000/user/root/t.txt";
		Job job = new Job(config, "WordCount table with " + input);
		job.setJarByClass(WordCountHBase.class);
		
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TableOutputFormat.class);
		
		FileInputFormat.addInputPath(job, new Path(input));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}

Input file contents:

hello world hello hadoop
bye world bye hadoop

Output table data:

hbase(main):035:0> scan 'wordcount'
ROW       COLUMN+CELL
bye      column=content:count, timestamp=1378280133721, value=2
hadoop  column=content:count, timestamp=1378280133721, value=2
hello   column=content:count, timestamp=1378280133721, value=2
world   column=content:count, timestamp=1378280133721, value=2
4 row(s) in 0.0890 seconds

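Because the counts were written as string bytes (Bytes.toBytes(String.valueOf(sum))), they read back as strings rather than binary integers. The following verification sketch is my addition, not part of the original post; it reuses the config object from main() and assumes imports for HTable, Get, and Result from org.apache.hadoop.hbase.client:

HTable table = new HTable(config, "wordcount");            // 0.94-era client API
Get get = new Get(Bytes.toBytes("hello"));                 // fetch the row for "hello"
Result result = table.get(get);
byte[] raw = result.getValue(Bytes.toBytes("content"), Bytes.toBytes("count"));
System.out.println("hello -> " + Bytes.toString(raw));     // expect "hello -> 2"
table.close();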

2. Application-server log analysis: the timespend example

Given a log file with the following contents, the task is to compute the total time spent by each user.

01/01/2011 user1 3s
01/01/2011 user2 5s
01/01/2011 user3 10s
01/01/2011 user4 6s
01/02/2011 user1 4s
01/02/2011 user2 30s
01/02/2011 user3 4s
01/02/2011 user1 15s
01/02/2011 user2 15s

The code follows. First, the Mapper:

package mapred.t1;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class THMapper extends Mapper<LongWritable, Text, Text, Text> {

	@Override
	public void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// A line looks like "01/01/2011 user1 3s"; splitting on non-word
		// characters yields {"01", "01", "2011", "user1", "3s"}.
		String[] items = value.toString().split("\\W+");
		String k = items[3]; // user name
		String v = items[4]; // time spent, e.g. "3s"
		System.out.println("key:" + k + "," + "value:" + v); // debug output, visible in task logs
		context.write(new Text(k), new Text(v));
	}

}

Next, the Reducer:

package mapred.t1;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

public class THReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {

	@Override
	public void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		int num = 0;
		// Each value is of the form "3s": strip the trailing 's' and sum.
		for (Text value : values) {
			String timeStr = value.toString();
			num += Integer.parseInt(timeStr.substring(0, timeStr.length() - 1));
		}
		// Text.getBytes() returns the internal buffer, which can be longer
		// than the actual content, so build the row key via toString().
		byte[] row = Bytes.toBytes(key.toString());
		Put putrow = new Put(row);
		putrow.add(Bytes.toBytes("spend"), Bytes.toBytes("time"),
				Bytes.toBytes(Integer.toString(num)));
		context.write(new ImmutableBytesWritable(row), putrow);
	}

}

Then the Driver:

package mapred.t1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;

public class THDriver extends Configured implements Tool {

	@Override
	public int run(String[] arg0) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		conf.set("hbase.zookeeper.quorum.", "10.10.4.55");

		Job job = new Job(conf, "Txt-to-Hbase");
		job.setJarByClass(TxtHbase.class);
		
		/*
		 * Create the target table (column family "spend") up front.
		 */
		String tableName = "timespend";
		HTableDescriptor htd = new HTableDescriptor(tableName);
		HColumnDescriptor col = new HColumnDescriptor("spend");
		htd.addFamily(col);
		
		HBaseAdmin admin = new HBaseAdmin(conf);
		if(admin.tableExists(tableName)) {
			System.out.println("table exists, trying recreate table!");
			admin.disableTable(tableName);
			admin.deleteTable(tableName);
		}
		System.out.println("create new table : " + tableName);
		admin.createTable(htd);
		admin.close();

		
		Path in = new Path("hdfs://10.10.4.55:9000/user/root/t1.txt");

		job.setInputFormatClass(TextInputFormat.class);
		FileInputFormat.addInputPath(job, in);

		job.setMapperClass(THMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		TableMapReduceUtil.initTableReducerJob(tableName, 
                           THReducer.class, job);

		return job.waitForCompletion(true) ? 0 : 1;
	}

}

Finally, the main class:

package mapred.t1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class TxtHbase {
    public static void main(String[] args) throws Exception {
        int mr = ToolRunner.run(new Configuration(), new THDriver(), args);
        System.exit(mr);
    }
}
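
One small improvement worth noting, as a suggestion of mine rather than something in the original post: since THDriver extends Configured, the configuration parsed by ToolRunner is available through getConf(), and merging it keeps command-line -D options effective:

// Inside THDriver.run(): start from ToolRunner's parsed configuration
// instead of an empty one, so -D options passed on the command line apply.
Configuration conf = HBaseConfiguration.create(getConf());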

After the MapReduce job above has run, the result table timespend can be inspected in the HBase shell:

hbase(main):002:0> scan 'timespend'
ROW           COLUMN+CELL    
 user1       column=spend:time, timestamp=1378282924060, value=22
 user2       column=spend:time, timestamp=1378282924060, value=50
 user3       column=spend:time, timestamp=1378282924060, value=14 
 user4       column=spend:time, timestamp=1378282924060, value=6 
4 row(s) in 0.2650 seconds
hbase(main):003:0> 


Because the reducer has to write its results into an HBase table, this program differs somewhat from an ordinary MapReduce program. As the code above shows, the reducer class extends TableReducer. From the API (see Figure 1 below), TableReducer itself extends Reducer; as with any other reducer, its input key/value pair corresponds to the Mapper's output key/value pair. Its output key may be of any type, but its output value must be a Put or Delete instance.

Figure 1: TableReducer class details
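
For illustration, here is a minimal sketch of a reducer variant (my addition, not from the original post) showing that a Delete is an equally valid output value; it reuses the WordCount types and assumes an extra import of org.apache.hadoop.hbase.client.Delete:

	// Variant of the WordCount reducer: rows that sum to zero are deleted
	// instead of written. Both Put and Delete are valid output values.
	public static class CleanupReduce extends TableReducer<Text, IntWritable, NullWritable> {
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable i : values) {
				sum += i.get();
			}
			byte[] row = Bytes.toBytes(key.toString());
			if (sum == 0) {
				context.write(NullWritable.get(), new Delete(row));
			} else {
				Put put = new Put(row);
				put.add(Bytes.toBytes("content"), Bytes.toBytes("count"),
						Bytes.toBytes(String.valueOf(sum)));
				context.write(NullWritable.get(), put);
			}
		}
	}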

The reducer's output key has type ImmutableBytesWritable (org.apache.hadoop.hbase.io). According to the API, it is a byte sequence that is usable as a key or value type; it is based on BytesWritable and is not resizable. The reducer's output value is a Put, as in the following line:

context.write(new ImmutableBytesWritable(row), putrow);


When the job is configured in the Driver, job.setReducerClass() is never called; instead, TableMapReduceUtil.initTableReducerJob(tableName, THReducer.class, job) wires up the reducer.
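
Roughly speaking, initTableReducerJob bundles the wiring that the WordCount example above did by hand. The sketch below shows the approximate manual equivalent, based on my reading of the 0.94-era TableMapReduceUtil; treat the exact details as an assumption, and note it needs imports for TableOutputFormat, ImmutableBytesWritable, and Writable:

// Approximate manual equivalent of
// TableMapReduceUtil.initTableReducerJob(tableName, THReducer.class, job):
job.setOutputFormatClass(TableOutputFormat.class);   // write mutations to HBase
job.setReducerClass(THReducer.class);                // the TableReducer subclass
job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, tableName); // target table
job.setOutputKeyClass(ImmutableBytesWritable.class); // row key type
job.setOutputValueClass(Writable.class);             // a Put or Delete at runtime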


The TableMapReduceUtil class (org.apache.hadoop.hbase.mapreduce) is described in the API as "a utility for TableMapper or TableReducer". Since the reducer in this example extends TableReducer, that explains why TableMapReduceUtil is used to configure it. Its methods include addDependencyJars(), initTableMapperJob(), initTableReducerJob(), limitNumReduceTasks(), setNumReduceTasks(), and others; see the API for details.
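
For the opposite direction, reading an HBase table as the input of a MapReduce job, initTableMapperJob plays the analogous role for a TableMapper. A hedged sketch follows; MyTableMapper is a hypothetical TableMapper subclass, the Scan tuning values are illustrative, and an import of org.apache.hadoop.hbase.client.Scan is assumed:

// Configure a job that scans the "timespend" table as its input.
Scan scan = new Scan();
scan.setCaching(500);        // fetch rows in batches; good practice for MR scans
scan.setCacheBlocks(false);  // avoid polluting the region server block cache
TableMapReduceUtil.initTableMapperJob(
		"timespend",          // source table
		scan,                 // controls which rows/columns are read
		MyTableMapper.class,  // a TableMapper subclass (hypothetical here)
		Text.class,           // mapper output key class
		Text.class,           // mapper output value class
		job);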

