做项目的时候碰到一个需要用链式mapreduce的问题,然后调研了一下ChainMapper和ChainReducer的使用,顺带缕一下新旧API的异同。
首先需要说明的是:从0.20.0开始,hadoop的API发生了改变,但是旧API依然保留,两套API的包名如下:
旧版API包名:org.apache.hadoop.mapred
新版API包名:org.apache.hadoop.mapreduce
所有hadoop已发布版本的具体文档可以在Hadoop官方网站的文档页面(hadoop.apache.org/docs)查看。
2.新旧版API的异同
以map/reduce定义为例
2.1 map
类名定义
//新版
public static class MyMapper extends Mapper<Object, Text, Text, Text>{}
//旧版
public static class MyMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text>{}
涉及包名
//新版
import org.apache.hadoop.mapreduce.Mapper
//旧版
import org.apache.hadoop.mapred.MapReduceBase
import org.apache.hadoop.mapred.Mapper
方法定义
新版
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {}
旧版
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)throws IOException{}
涉及包名
//新版
import java.io.IOException
//旧版
import java.io.IOException
import org.apache.hadoop.mapred.OutputCollector
import org.apache.hadoop.mapred.Reporter
2.2 reduce
类名定义
//新版
public static MyReducer extends Reducer<Text,Text,Text,Text> {}
//旧版
public static class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{}
涉及包名
//新版
import org.apache.hadoop.mapreduce.Reducer
//旧版
import org.apache.hadoop.mapred.Reducer
方法定义
//新版
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {}
//旧版
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{}
涉及包名
//新版
与map相同
//旧版
比map多一个包
import java.util.Iterator;
2.3 main方法
新版
Configuration conf = new Configuration();
Job job = new Job(conf, "mapreduce"); job.setJarByClass(MyMapReduce.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(path1)); FileOutputFormat.setOutputPath(job, new Path(path2)); job.waitForCompletion(true) ;
旧版
JobConf conf = new JobConf(MyMapReduce.class);
conf.setJobName("mapreduce");
conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class);
conf.setMapperClass(MyMapper.class);conf.setCombinerClass(MyReducer.class); conf.setReducerClass(MyReducer.class);
conf.setInputFormat(TextInputFormat.class);conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(path1));FileOutputFormat.setOutputPath(conf, new Path(path2));
JobClient.runJob(conf);
即新版API中用Job代替了旧版中的JobConf,同时对一些方法也进行了重新定义,具体看下面的包对比
涉及包名
新版
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
旧版
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat;
3.链式MapReduce
ChainMapper和ChainReducer在旧版包中,链式mapreduce只允许有一个reduce,但是可以有多个map,包括reduce之前和之后,我在实现时链如下:map->reduce->map
package org.apache.hadoop.examples; import java.io.IOException; import java.util.StringTokenizer; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.lib.ChainMapper; import org.apache.hadoop.mapred.lib.ChainReducer; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; public class GraphPartition{ public static class LabelCompareMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text>{ public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String line = value.toString(); if(line.substring(0,1).matches("[0-9]{1}")) { String[] values = line.split("\t"); String[] heads = values[0].split(" "); if(values[1].contains("_")) { values[1] = values[1].replace("_",""); String[] tails = values[1].split(" "); String src_ver = heads[0]; String label = heads[1]; for(int i=0;i<tails.length;i++) { output.collect(new Text(tails[i]),new Text(src_ver+"_"+label)); //<2 1_1> } } output.collect(new Text(heads[0]),new Text(heads[1])); //<2 2> } } } public static class LabelCompareReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{ private Text result = new Text(); public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String head = ""; String tail = ""; while(values.hasNext()) { String val = values.next().toString(); 
if(val.contains("_")) { tail = tail+val+" "; } else { head = val; } } if(tail.contains("_")) { result.set(head+" "+tail); output.collect(key,result); //<2 2 1_1> } } } public static class GraphPartitionMapper extends MapReduceBase implements Mapper<Text, Text,Text,Text>{ public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String ver = key.toString(); String tail = value.toString(); String[] sps = tail.split(" "); String ver_lab = sps[0]; for(int i=0;i<sps.length;i++) { if(sps[i].length()>=1) { if(sps[i].contains("_")) { String[] blocks = sps[i].split("_"); if(blocks[1]==ver_lab) { output.collect(new Text(blocks[0]+" "+ver+" "+blocks[1]),new Text()); } else { output.collect(new Text(blocks[0]+" "+ver+" "+ver_lab),new Text()); } } else { ver_lab = sps[i]; } } } } } public static void main(String[] args) throws Exception{ String path1="lbp/input"; String path2="lbp/out1"; JobConf job = new JobConf(GraphPartition.class); job.setJobName("ChainJob"); job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); JobConf labelcomparemapperconf = new JobConf(false); ChainMapper.addMapper(job,LabelCompareMapper.class,Object.class,Text.class,Text.class,Text.class, true,labelcomparemapperconf); JobConf labelcomparereducerconf = new JobConf(false); ChainReducer.setReducer(job,LabelCompareReducer.class,Text.class,Text.class, Text.class,Text.class,true,labelcomparereducerconf); JobConf graphpartitionmapperconf = new JobConf(false); ChainReducer.addMapper(job,GraphPartitionMapper.class,Text.class,Text.class,Text.class,Text.class,true,graphpartitionmapperconf); job.setJarByClass(GraphPartition.class); job.setNumReduceTasks(1); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, new Path(path1)); FileOutputFormat.setOutputPath(job, new Path(path2)); JobClient.runJob(job); } }