Hadoop Old vs. New API: A First Look, Plus Chained MapReduce

1. Background
While working on a project I ran into a problem that called for a chained MapReduce, so I looked into how ChainMapper and ChainReducer are used and took the opportunity to sort out the differences between the old and new APIs.
First, note that starting with release 0.20.0 the Hadoop API changed, but the old API is still kept around. Old-API package name:

org.apache.hadoop.mapred

New-API package name:

org.apache.hadoop.mapreduce

Detailed documentation for every released Hadoop version can be found here.

2. Differences Between the Old and New APIs

The map/reduce definitions serve as the running example.

2.1 map

Class declaration

// New API

public static class MyMapper extends Mapper<Object, Text, Text, Text>{}

// Old API

public static class MyMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text>{}

Packages involved

// New API

import org.apache.hadoop.mapreduce.Mapper

// Old API

import org.apache.hadoop.mapred.MapReduceBase

import org.apache.hadoop.mapred.Mapper

Method signature

New API

public void map(Object key, Text value, Context context) throws IOException, InterruptedException {}

Old API

public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)throws IOException{}

Packages involved

// New API

import java.io.IOException

// Old API

import java.io.IOException
import org.apache.hadoop.mapred.OutputCollector
import org.apache.hadoop.mapred.Reporter
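
To make the comparison concrete, here is a minimal self-contained mapper written against the new API. The class name, the tab-separated input, and the pass-through logic are illustrative assumptions only, not part of the project above.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Minimal new-API mapper: output goes through Context instead of the old
// OutputCollector/Reporter pair, and map() may throw InterruptedException.
public class DemoMapper extends Mapper<Object, Text, Text, Text> {
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Assumed input: tab-separated "key<TAB>value" lines.
        String[] parts = value.toString().split("\t", 2);
        if (parts.length == 2) {
            context.write(new Text(parts[0]), new Text(parts[1]));
        }
    }
}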

2.2 reduce

Class declaration

// New API

public static class MyReducer extends Reducer<Text, Text, Text, Text>{}

// Old API

public static class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{}

Packages involved

// New API

import org.apache.hadoop.mapreduce.Reducer

// Old API

import org.apache.hadoop.mapred.Reducer

Method signature

// New API

public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {}

// Old API

public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{}

Packages involved

// New API

Same as for map.

// Old API

One more import than for map:

import java.util.Iterator;
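
Again, here is a minimal new-API reducer for illustration; the space-joining logic is an assumption made up for this sketch. Note that the new API hands in an Iterable, so a plain for-each loop replaces the old hasNext()/next() iteration.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Minimal new-API reducer: values arrive as an Iterable and results are
// written through Context.
public class DemoReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text val : values) {              // for-each works directly on the Iterable
            joined.append(val.toString()).append(' ');
        }
        context.write(key, new Text(joined.toString().trim()));
    }
}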

2.3 The main method

New API

Configuration conf = new Configuration();

Job job = new Job(conf, "mapreduce");
job.setJarByClass(MyMapReduce.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyReducer.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
job.waitForCompletion(true);

Old API

JobConf conf = new JobConf(MyMapReduce.class);

conf.setJobName("mapreduce");

conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);

conf.setMapperClass(MyMapper.class);

conf.setCombinerClass(MyReducer.class);
conf.setReducerClass(MyReducer.class);

conf.setInputFormat(TextInputFormat.class);

conf.setOutputFormat(TextOutputFormat.class);

FileInputFormat.setInputPaths(conf, new Path(path1));

FileOutputFormat.setOutputPath(conf, new Path(path2));

JobClient.runJob(conf);

In short, the new API replaces the old JobConf with Job, and a number of methods were redefined along the way; the package comparison below shows the details.

Packages involved

New API

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

Old API

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;

import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;

import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
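
One further note on the driver: in Hadoop 2.x the Job(Configuration, String) constructor used above is deprecated in favor of the Job.getInstance factory method. A minimal sketch of the same driver in that style (MyMapReduce, MyMapper, MyReducer, path1 and path2 are the names from the example above):

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "mapreduce");   // preferred over new Job(conf, ...) in 2.x
job.setJarByClass(MyMapReduce.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyReducer.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
System.exit(job.waitForCompletion(true) ? 0 : 1);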

3. Chained MapReduce

ChainMapper and ChainReducer live in the old-API package. A chained MapReduce job allows only one reduce, but it can have multiple maps, both before and after the reduce. The chain in my implementation is: map -> reduce -> map.

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.lib.ChainMapper;
import org.apache.hadoop.mapred.lib.ChainReducer;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class GraphPartition {

    // First map stage: split each line into a head vertex and its tail vertices,
    // emitting <tail, src_ver_label> pairs plus the head's own <vertex, label> pair.
    public static class LabelCompareMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, OutputCollector<Text, Text> output,
                        Reporter reporter) throws IOException {
            String line = value.toString();
            if (line.substring(0, 1).matches("[0-9]{1}")) {
                String[] values = line.split("\t");
                String[] heads = values[0].split(" ");
                if (values[1].contains("_")) {
                    values[1] = values[1].replace("_", "");
                    String[] tails = values[1].split(" ");
                    String src_ver = heads[0];
                    String label = heads[1];
                    for (int i = 0; i < tails.length; i++) {
                        output.collect(new Text(tails[i]), new Text(src_ver + "_" + label)); // <2 1_1>
                    }
                }
                output.collect(new Text(heads[0]), new Text(heads[1])); // <2 2>
            }
        }
    }

    // Reduce stage: group the head label and all "src_label" markers for a vertex.
    public static class LabelCompareReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output,
                           Reporter reporter) throws IOException {
            String head = "";
            String tail = "";
            while (values.hasNext()) {
                String val = values.next().toString();
                if (val.contains("_")) {
                    tail = tail + val + " ";
                } else {
                    head = val;
                }
            }
            if (tail.contains("_")) {
                result.set(head + " " + tail);
                output.collect(key, result); // <2 2 1_1>
            }
        }
    }

    // Second map stage (after the reduce): emit one edge record per "src_label" marker.
    public static class GraphPartitionMapper extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        public void map(Text key, Text value, OutputCollector<Text, Text> output,
                        Reporter reporter) throws IOException {
            String ver = key.toString();
            String tail = value.toString();
            String[] sps = tail.split(" ");
            String ver_lab = sps[0];
            for (int i = 0; i < sps.length; i++) {
                if (sps[i].length() >= 1) {
                    if (sps[i].contains("_")) {
                        String[] blocks = sps[i].split("_");
                        // Compare string contents with equals(), not ==.
                        if (blocks[1].equals(ver_lab)) {
                            output.collect(new Text(blocks[0] + " " + ver + " " + blocks[1]), new Text());
                        } else {
                            output.collect(new Text(blocks[0] + " " + ver + " " + ver_lab), new Text());
                        }
                    } else {
                        ver_lab = sps[i];
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        String path1 = "lbp/input";
        String path2 = "lbp/out1";

        JobConf job = new JobConf(GraphPartition.class);
        job.setJobName("ChainJob");
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        // map -> reduce -> map chain: mappers before the reducer are added with
        // ChainMapper.addMapper, the single reducer with ChainReducer.setReducer,
        // and mappers after the reducer with ChainReducer.addMapper.
        JobConf labelcomparemapperconf = new JobConf(false);
        ChainMapper.addMapper(job, LabelCompareMapper.class,
                Object.class, Text.class, Text.class, Text.class, true, labelcomparemapperconf);

        JobConf labelcomparereducerconf = new JobConf(false);
        ChainReducer.setReducer(job, LabelCompareReducer.class,
                Text.class, Text.class, Text.class, Text.class, true, labelcomparereducerconf);

        JobConf graphpartitionmapperconf = new JobConf(false);
        ChainReducer.addMapper(job, GraphPartitionMapper.class,
                Text.class, Text.class, Text.class, Text.class, true, graphpartitionmapperconf);

        job.setJarByClass(GraphPartition.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(path1));
        FileOutputFormat.setOutputPath(job, new Path(path2));
        JobClient.runJob(job);
    }
}
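
For completeness, Hadoop 2.x also ships new-API chain classes under org.apache.hadoop.mapreduce.lib.chain. The sketch below is what the same map -> reduce -> map driver might look like with them; it assumes the three classes above have been rewritten against org.apache.hadoop.mapreduce.Mapper/Reducer, and these signatures take a per-stage Configuration instead of the byValue flag.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GraphPartitionChainDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "ChainJob");
        job.setJarByClass(GraphPartitionChainDriver.class);

        // One Configuration per stage; new Configuration(false) when a stage
        // needs no extra settings.
        ChainMapper.addMapper(job, LabelCompareMapper.class,
                Object.class, Text.class, Text.class, Text.class, new Configuration(false));
        ChainReducer.setReducer(job, LabelCompareReducer.class,
                Text.class, Text.class, Text.class, Text.class, new Configuration(false));
        ChainReducer.addMapper(job, GraphPartitionMapper.class,
                Text.class, Text.class, Text.class, Text.class, new Configuration(false));

        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("lbp/input"));
        FileOutputFormat.setOutputPath(job, new Path("lbp/out1"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}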
