MapReduce原理和程序

最新推荐文章于 2022-01-16 16:51:18 发布

尚能饭否

最新推荐文章于 2022-01-16 16:51:18 发布

阅读量435

点赞数

分类专栏：大数据 文章标签：hadoop mapreduce

本文链接：https://blog.csdn.net/shangwei1991/article/details/51191433

版权

大数据专栏收录该内容

7 篇文章 0 订阅

订阅专栏

1 MapReduce原理图

2 WordCount程序

package captain.hadoop.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word-count MapReduce job: counts how many times each whitespace-separated
 * word occurs in the input.
 *
 * <p>Usage: {@code WordCount [inputPath [outputPath]]}. Any omitted argument
 * falls back to the defaults {@code /words} and {@code /wordcount_out}.
 * The process exits with status 0 when the job succeeds and 1 otherwise.
 */
public class WordCount {

	/** Input path used when no first command-line argument is given. */
	private static final String DEFAULT_INPUT_PATH = "/words";

	/** Output path used when no second command-line argument is given. */
	private static final String DEFAULT_OUTPUT_PATH = "/wordcount_out";

	/**
	 * Mapper that emits {@code <word, 1>} for every word of an input line.
	 * Note that the generic parameters are Hadoop writable types.
	 */
	public static class WordCountMapper extends
			Mapper<LongWritable, Text, Text, LongWritable> {

		/** Constant count emitted for each occurrence of a word. */
		private static final LongWritable ONE = new LongWritable(1);

		/**
		 * Reused output key, following the idiom of the Hadoop WordCount
		 * tutorial, to avoid allocating a new object per token.
		 */
		private final Text outputWord = new Text();

		/**
		 * Called once for every line read from the input.
		 *
		 * @param key
		 *            byte offset of the start of the line within the source file
		 * @param value
		 *            text content of the line
		 * @param context
		 *            job context the {@code <word, 1>} pairs are written to
		 * @throws IOException
		 *             if writing to the context fails
		 * @throws InterruptedException
		 *             if the task is interrupted while writing
		 */
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			// Split on runs of whitespace so consecutive separators do not
			// produce empty tokens in the middle of the line.
			String[] splited = value.toString().split("\\s+");
			for (String word : splited) {
				// A blank line or leading whitespace still yields one empty
				// token; it is not a word and must not be counted.
				if (word.isEmpty()) {
					continue;
				}
				// Emit the word together with a count of 1.
				outputWord.set(word);
				context.write(outputWord, ONE);
			}
		}
	}

	// Intermediate output: <hello,1><captain,1><hello,1><kellie,1>...
	// After automatic sorting: <captain,1><hello,1><hello,1><kellie,1>...
	// After grouping: <captain,{1}> <hello,{1,1}> <kellie,{1}>...

	/**
	 * Reducer that sums the counts of each word. Its input types match the
	 * mapper's output types; because input and output types are identical it
	 * is also registered as the combiner in {@link #main(String[])}.
	 */
	public static class WordCountReducer extends
			Reducer<Text, LongWritable, Text, LongWritable> {
		/**
		 * Called once per group (i.e. once per distinct word).
		 *
		 * @param word
		 *            the word
		 * @param times
		 *            the counts collected for this word, e.g. {1,1} for "hello"
		 * @param context
		 *            job context the {@code <word, total>} pair is written to
		 * @throws IOException
		 *             if writing to the context fails
		 * @throws InterruptedException
		 *             if the task is interrupted while writing
		 */
		@Override
		protected void reduce(Text word, Iterable<LongWritable> times,
				Reducer<Text, LongWritable, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			long sum = 0L;
			for (LongWritable longWritable : times) {
				sum += longWritable.get();
			}
			context.write(word, new LongWritable(sum));
		}
	}

	/**
	 * Configures and submits the word-count job, then exits with the job's
	 * status.
	 *
	 * @param args
	 *            optional {@code [inputPath [outputPath]]}; missing entries
	 *            fall back to the built-in defaults
	 * @throws Exception
	 *             if the job cannot be created, submitted or monitored
	 */
	public static void main(String[] args) throws Exception {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

		// Create the job and tell Hadoop which jar contains the job classes.
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(WordCount.class);

		// Optional: selects how map input is parsed. TextInputFormat is
		// already the default.
		job.setInputFormatClass(TextInputFormat.class);

		// Mapper class.
		job.setMapperClass(WordCountMapper.class);
		/*
		 * Optional combiner: merges the output of a single map task locally.
		 * It cannot merge the results of all map tasks, so it does not
		 * replace the reduce phase. Benefit: less intermediate data has to be
		 * shuffled over the network.
		 */
		job.setCombinerClass(WordCountReducer.class);
		// Optional: number of reduce tasks and the rule that assigns each key
		// to one of them.
		job.setNumReduceTasks(2);
		job.setPartitionerClass(MyPartitioner.class);
		// Reducer class.
		job.setReducerClass(WordCountReducer.class);

		// Output key/value types.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// Input and output paths.
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		// Submit the job, wait for it, and propagate success/failure to the
		// caller through the process exit status.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/**
	 * Custom partitioner (overrides {@code getPartition}) that decides which
	 * reduce task receives a key: the word "hello" goes to partition 0, every
	 * other word to partition 1.
	 */
	public static class MyPartitioner extends Partitioner<Text, LongWritable> {
		/**
		 * @param key
		 *            the word being partitioned
		 * @param value
		 *            the count associated with the word (unused)
		 * @param numPartitions
		 *            total number of partitions (reduce tasks)
		 * @return 0 for "hello" or when there is at most one partition,
		 *         otherwise 1
		 */
		@Override
		public int getPartition(Text key, LongWritable value, int numPartitions) {
			// Defensive: partition 1 does not exist when there is at most one
			// partition, so everything must go to partition 0.
			if (numPartitions <= 1) {
				return 0;
			}
			String word = key.toString();
			if ("hello".equals(word)) {
				return 0;
			} else {
				return 1;
			}
		}
	}
}

3 FriendsRecommended程序

package captain.hadoop.mr;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Friend-recommendation MapReduce job. Each input line is a tab-separated
 * relationship such as {@code captain<TAB>kellie}. For every person, all
 * pairs of that person's distinct friends are emitted as mutual
 * recommendations.
 *
 * <p>Usage: {@code FriendsRecommended [inputPath [outputPath]]}. Any omitted
 * argument falls back to the defaults {@code /relationship} and
 * {@code /FriendsRecommended_out}. The process exits with status 0 when the
 * job succeeds and 1 otherwise.
 *
 * <p>NOTE(review): the reducer does not check whether two recommended people
 * are already direct friends, and the same pair can be emitted by several
 * reduce groups; post-process the output if that matters.
 */
public class FriendsRecommended {

	/** Input path used when no first command-line argument is given. */
	private static final String DEFAULT_INPUT_PATH = "/relationship";

	/** Output path used when no second command-line argument is given. */
	private static final String DEFAULT_OUTPUT_PATH = "/FriendsRecommended_out";

	/**
	 * Mapper that emits each relationship in both directions. Note that the
	 * generic parameters are Hadoop writable types.
	 */
	public static class FriendsRecommendedMapper extends Mapper<LongWritable, Text, Text, Text> {

		/** Reused holders for the two names of the current line. */
		private final Text first = new Text();
		private final Text second = new Text();

		/**
		 * Called once for every line read from the input.
		 *
		 * @param key
		 *            byte offset of the start of the line within the source file
		 * @param value
		 *            text content of the line, i.e. one relationship such as
		 *            {@code captain<TAB>kellie}
		 * @param context
		 *            job context the pairs are written to
		 * @throws IOException
		 *             if writing to the context fails
		 * @throws InterruptedException
		 *             if the task is interrupted while writing
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String[] ss = line.split("\t");
			// Skip blank or malformed lines instead of failing the whole task
			// with an ArrayIndexOutOfBoundsException.
			if (ss.length < 2 || ss[0].isEmpty() || ss[1].isEmpty()) {
				return;
			}
			// A person is not their own friend; a self-relation would put the
			// person into their own friend set and produce bogus
			// recommendations in the reducer.
			if (ss[0].equals(ss[1])) {
				return;
			}
			first.set(ss[0]);
			second.set(ss[1]);
			// First person as key, second person as value.
			context.write(first, second);
			// Second person as key, first person as value.
			context.write(second, first);
		}
	}

	/**
	 * Reducer that receives one person together with all of that person's
	 * friends and recommends those friends to each other. Its input types
	 * match the mapper's output types.
	 */
	public static class FriendsRecommendedReducer extends Reducer<Text, Text, Text, Text> {

		/** Reused holders for the recommendation pair being written. */
		private final Text left = new Text();
		private final Text right = new Text();

		/**
		 * Called once per person.
		 *
		 * @param key
		 *            the person whose friends are being paired up
		 * @param i
		 *            the friends of that person (may contain duplicates)
		 * @param context
		 *            job context the recommendation pairs are written to
		 * @throws IOException
		 *             if writing to the context fails
		 * @throws InterruptedException
		 *             if the task is interrupted while writing
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> i, Context context)
				throws IOException, InterruptedException {
			// A HashSet removes duplicate friends.
			Set<String> set = new HashSet<String>();
			for (Text t : i) {
				set.add(t.toString());
			}
			// With fewer than two distinct friends there is nobody to
			// recommend to anybody.
			if (set.size() < 2) {
				return;
			}
			// Emit every ordered pair (name1, name2) of distinct friends, so
			// each recommendation appears in both directions.
			for (String name1 : set) {
				left.set(name1);
				for (String name2 : set) {
					if (!name1.equals(name2)) {
						right.set(name2);
						context.write(left, right);
					}
				}
			}
		}
	}

	/**
	 * Configures and submits the friend-recommendation job, then exits with
	 * the job's status.
	 *
	 * @param args
	 *            optional {@code [inputPath [outputPath]]}; missing entries
	 *            fall back to the built-in defaults
	 * @throws IOException
	 *             if the job cannot be created or submitted
	 * @throws ClassNotFoundException
	 *             if a job class cannot be loaded
	 * @throws InterruptedException
	 *             if waiting for the job is interrupted
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

		// Create the job and tell Hadoop which jar contains the job classes.
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(FriendsRecommended.class);

		// Optional: selects how map input is parsed. TextInputFormat is
		// already the default.
		job.setInputFormatClass(TextInputFormat.class);

		// Mapper class.
		job.setMapperClass(FriendsRecommendedMapper.class);
		// Reducer class.
		job.setReducerClass(FriendsRecommendedReducer.class);

		// Output key/value types.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Input and output paths.
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		// Submit the job, wait for it, and propagate success/failure to the
		// caller through the process exit status.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}