1. Preface
This post walks through some common, classic MapReduce examples.
2. Deduplication
Idea: the shuffle phase of MapReduce groups identical keys together, so emitting each input line as a key is enough to deduplicate.
Core: Map -> context.write(new Text(line), new Text("")); Reduce -> context.write(key, new Text(""))
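As a quick illustration (this sample file is made up for this post, not taken from a real dataset), an input like

2012-3-1 a
2012-3-2 b
2012-3-1 a
2012-3-2 b
2012-3-3 c

comes out of the job as

2012-3-1 a
2012-3-2 b
2012-3-3 c

because every identical line becomes the same key and the reducer writes each key exactly once.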
package hadoop.v5;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import hadoop.utils.HDFSUtils;
/**
 * Deduplication: emit each input line as the key so the shuffle merges duplicates away.
 * @author : chenhaipeng
 * @date : 2015-09-06 02:00:50
 */
public class Duplication extends Configured implements Tool {
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
// the whole line is the key; duplicates collapse in the shuffle
context.write(new Text(line), new Text(""));
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text>{
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
// emit each distinct key exactly once; the value does not matter
context.write(key, new Text(""));
}
}
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new Duplication(), args);
System.exit(ret);
}
// delete the output directory if it already exists so the job can be rerun
public static void deletedir(String path){
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
/*
* @see org.apache.hadoop.util.Tool#run(java.lang.String[])
*/
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
job.setJarByClass(Duplication.class);
job.setJobName("Duplication");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// not strictly required; TextInputFormat/TextOutputFormat are the defaults anyway
// job.setInputFormatClass(TextInputFormat.class);
// job.setOutputFormatClass(TextOutputFormat.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// the reducer is idempotent, so it also works as a combiner
job.setCombinerClass(Reduce.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
deletedir(args[1]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
}
3. Sorting
Idea: Hadoop sorts map output automatically during the shuffle; to control the partitioning and the sort order we override the partitioner and the comparator.
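The job below consumes the output of an ordinary word count, one "word count" pair per line. As a made-up example, an input of

hadoop 1
hello 3
world 2

should come out roughly as

hello 3
world 2
hadoop 1

i.e. sorted by count from highest to lowest, spread across the two reducer output files configured below.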
package hadoop.v5;
import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.EJob;
import hadoop.utils.HDFSUtils;
/**
 * Scenario: sort the output of the previous word count from the highest count to the lowest.
 * Approach: Hadoop sorts map output during the shuffle, so we plug in our own comparator and partitioner.
 * @author : chenhaipeng
 * @date : 2015-09-08 23:58:32
 */
public class SortWordCount {
public static class SortMap extends Mapper<LongWritable, Text, IntWritable, Text> {
private Text word = new Text();
private IntWritable count = new IntWritable();
/*
 * Swap key and value so the count becomes the map output key (and is therefore what gets sorted).
 *
 * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
 * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
if (StringUtils.isNotEmpty(line)) {
	// each input line holds a word and its count
	StringTokenizer tokenizer = new StringTokenizer(line);
	while (tokenizer.hasMoreTokens()) {
		word.set(tokenizer.nextToken().trim());
		count.set(Integer.parseInt(tokenizer.nextToken().trim()));
		context.write(count, word);
}
}
}
}
public static class SortReduce extends Reducer<IntWritable, Text, Text, IntWritable> {
private Text result = new Text();
/*
 * The shuffle has already sorted the keys; here we just swap each pair back to (word, count).
 *
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object,
 * java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
result.set(value.toString());
context.write(result, key);
}
}
}
/**
 * The partitioner splits the keys across the reducers; when the split is uneven you may
 * see one reducer produce an empty output file.
 * @author : chenhaipeng
 * @date : 2015-09-09 00:38:54
 */
public static class SortPartition extends Partitioner<IntWritable, Text>{
/*
* @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
*/
@Override
public int getPartition(IntWritable key, Text value, int numPartitions) {
// treat 2 as the boundary of the key space
int maxValue = 2;
int keySection = 0;
// only split when there is more than one reducer and the key falls below maxValue; otherwise everything goes to partition 0
if(numPartitions > 1 && key.hashCode() < maxValue){
int sectionValue = maxValue / (numPartitions -1);
int count = 0;
while(key.hashCode() - sectionValue * count > sectionValue){
count++;
}
keySection = numPartitions -1 - count;
}
return keySection;
}
}
public static class SortComparator extends WritableComparator{
/**
 * Register IntWritable as the key type and have the comparator deserialize keys before comparing.
 */
protected SortComparator() {
	super(IntWritable.class, true);
}
/* Descending order: negate the default ascending comparison.
 * @see org.apache.hadoop.io.WritableComparator#compare(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.WritableComparable)
 */
@Override
public int compare(WritableComparable a, WritableComparable b) {
return -super.compare(a, b);
}
}
public static void deletedir(String path){
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// Package the compiled classes into a temporary jar so the job can be submitted straight from the IDE
File jarFile = EJob.createTempJar("bin");
// EJob.addClasspath("D:/hadoop-1.2.1/conf/conf");
ClassLoader classLoader = EJob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: SortWordCount <in> <out>");
System.exit(2);
}
// point the job at the cluster's JobTracker
conf.set("mapred.job.tracker", "192.168.100.150:9001");
Job job = new Job(conf);
job.setJobName("SortWordCount");
job.setJarByClass(SortWordCount.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(SortMap.class);
job.setReducerClass(SortReduce.class);
job.setPartitionerClass(SortPartition.class);
job.setSortComparatorClass(SortComparator.class);
// two reducers, so SortPartition actually has something to split
job.setNumReduceTasks(2);
// tell the JobClient which jar to ship to the cluster
((JobConf) job.getConfiguration()).setJar(jarFile.toString());
FileInputFormat.setInputPaths(job, new Path(args[0]));
deletedir(args[1]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)? 0 : 1);
}
}
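To see what SortPartition actually does with the two reducers configured above, here is a worked trace of the code (not something spelled out in the original post): with numPartitions = 2 and maxValue = 2, a key of 1 enters the branch because 1 < 2; sectionValue = 2 / (2 - 1) = 2, the while loop never runs because 1 - 2*0 > 2 is false, and keySection = 2 - 1 - 0 = 1. Any key of 2 or more skips the branch and returns 0. So words that appear only once land in part-r-00001 and everything else lands in part-r-00000, which is also why one reducer's output file can come out empty on small inputs.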
4. Single-Table Self-Join
Idea: a self-join of a table with itself. The shuffle groups records that share a key, so emit each record twice: once keyed by parent (tagged as the left table) and once keyed by child (tagged as the right table).
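For example, tracing the mapper below on two of the sample records given at the end of this section, each row is emitted twice:

Tom Lucy    ->  (Lucy, "1+Tom+Lucy")    and  (Tom, "2+Tom+Lucy")
Lucy Marry  ->  (Marry, "1+Lucy+Marry") and  (Lucy, "2+Lucy+Marry")

The reducer for key Lucy then receives both "1+Tom+Lucy" (left table: Tom is Lucy's child) and "2+Lucy+Marry" (right table: Marry is Lucy's parent), and their cross product yields the grandchild-grandparent pair Tom Marry.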
package hadoop.v5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.HDFSUtils;
/**
 * Self-join of a table with itself. The shuffle merges records that share a key: the left table
 * is keyed by parent and the right table by child, so for each person the reducer can take the
 * Cartesian product of that person's children and parents to get grandchild-grandparent pairs.
 *
 * @author : chenhaipeng
 * @date : 2015-09-09 01:48:20
 */
public class SelfJoin {
public static int time = 0;
/*
 * Map splits each record into child and parent and emits it twice:
 * keyed by parent with tag "1" (left table) and keyed by child with tag "2" (right table).
 */
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
/*
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
* java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
if (StringUtils.isNotEmpty(line)) {
String childName = new String();
String parentName = new String();
String relationType = new String();
int i = 0;
// find the tab that separates child from parent
while (line.charAt(i) != '\t') {
	i++;
}
String[] values = { line.substring(0, i), line.substring(i + 1) };
// skip the header row ("child parent")
if (values[0].compareTo("child") != 0) {
childName = values[0].trim();
parentName = values[1].trim();
// tag "1": left table, keyed by parent
relationType = "1";
context.write(new Text(parentName), new Text(relationType + "+" + childName + "+" + parentName));
// tag "2": right table, keyed by child, so both sides of the join meet in the shuffle
relationType = "2";
context.write(new Text(childName), new Text(relationType + "+" + childName + "+" + parentName));
}
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text> {
/*
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object,
* java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// write the output header exactly once (assumes a single reducer JVM)
if (time == 0) {
context.write(new Text("grandChild"), new Text("grandParent"));
time++;
}
int grandChildNum = 0;
String[] grandChild = new String[10];
int grandParentNum = 0;
String[] grandParent = new String[10];
Iterator iter = values.iterator();
while (iter.hasNext()) {
String record = iter.next().toString();
int len = record.length();
int i = 2;
if (len == 0)
continue;
char relationType = record.charAt(0);
String childname = new String();
String parnetname = new String();
while (record.charAt(i) != '+') {
i++;
}
childname = record.substring(2, i);
parnetname = record.substring(i + 1);
// System.out.println("childname-->"+childname);
// System.out.println("parnetname-->"+parnetname);
// tag '1': left-table record, its child field is a grandchild candidate
if (relationType == '1') {
grandChild[grandChildNum] = childname;
grandChildNum++;
} else {
	// tag '2': right-table record, its parent field is a grandparent candidate
	grandParent[grandParentNum] = parnetname;
grandParentNum++;
}
}
// Cartesian product of the grandchildren and grandparents collected for this key
if (grandParentNum != 0 && grandChildNum != 0) {
for (int m = 0; m < grandChildNum; m++) {
for (int n = 0; n < grandParentNum; n++) {
context.write(new Text(grandChild[m].trim()),new Text(grandParent[n].trim()));
}
}
}
}
}
public static void deletedir(String path) {
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: SelfJon <in> <out>");
System.exit(2);
}
Job job = new Job(conf);
job.setJarByClass(SelfJoin.class);
job.setJobName("SelfJoin");
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
deletedir(otherArgs[1]);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sample input:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Lucy Marry
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
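Worked out by hand from this input (the original post does not list the result), the job should produce, after the header row written by the reducer and roughly in key order:
grandChild grandParent
Tom Alice
Tom Jesse
Tom Marry
Tom Ben
Jone Marry
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse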
5. Multi-Table Join
Idea: similar to the single-table self-join, except there are two different tables, so the mapper must decide up front which record belongs to the left table and which to the right.
Core:
// advance i to the first digit, i.e. the start of the addressID
while(line.charAt(i) > '9' || line.charAt(i) < '0'){
	i++;
}
// a line that does not start with a digit comes from the left table (factoryname ... addressID)
if(line.charAt(0) > '9' || line.charAt(0) < '0'){
	int j = i-1;
	while(line.charAt(j) != ' ')j--;
	String[] values = {line.substring(0,j),line.substring(i)};
	context.write(new Text(values[1].trim()), new Text("1+"+values[0]));
}else{
	// a line that starts with a digit comes from the right table (addressID addressname)
	int j = i+1;
	while(line.charAt(j) != ' ')j++;
	String[] values = {line.substring(0,i+1),line.substring(j)};
	context.write(new Text(values[0].trim()), new Text("2+"+values[1]));
}
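Tracing this on the two sample records shown at the end of the section: for "Beijing Red Star 1", i stops at the trailing digit and the first character is a letter, so it is treated as a left-table (factory) record and the mapper emits key "1" with value "1+Beijing Red Star"; for "1 Beijing", the first character is already a digit, so it is treated as a right-table (address) record and the mapper emits key "1" with value "2+ Beijing". The reducer for key "1" then pairs the factory with the address.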
Full code:
package hadoop.v5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import hadoop.utils.HDFSUtils;
/**
 * Multi-table join; works like the self-join, with the factory table as the left side
 * and the address table as the right side.
 * @author : chenhaipeng
 * @date : 2015-09-10 00:33:26
 */
public class MTJoin {
public static int time = 0;
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
/*
* @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String line = value.toString();
int i = 0;
// skip the header rows of both tables
if(line.contains("factoryname") || line.contains("addressID")){
	return;
}
// advance i to the first digit, i.e. the start of the addressID
while(line.charAt(i) > '9' || line.charAt(i) < '0'){
i++;
}
// a line that does not start with a digit comes from the left table (factoryname ... addressID)
if(line.charAt(0) > '9' || line.charAt(0) < '0'){
int j = i-1;
while(line.charAt(j) != ' ')j--;
String[] values = {line.substring(0,j),line.substring(i)};
context.write(new Text(values[1].trim()), new Text("1+"+values[0]));
}else{
	// a line that starts with a digit comes from the right table (addressID addressname)
	int j = i+1;
while(line.charAt(j) != ' ')j++;
String[] values = {line.substring(0,i+1),line.substring(j)};
context.write(new Text(values[0].trim()), new Text("2+"+values[1]));
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text>{
/*
* @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(Text text, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// write the output header exactly once (assumes a single reducer)
if(time == 0){
context.write(new Text("factoryname"), new Text("addressname"));
time++;
}
int factorynum = 0;
String factory[] = new String[10];
int addressnum = 0;
String address[] = new String[10];
Iterator iter = values.iterator();
while(iter.hasNext()){
String record = iter.next().toString();
char type = record.charAt(0);
if(type == '1' ){ // left table: the payload is a factory name
factory[factorynum] = record.substring(2);
factorynum++;
}else{ // right table: the payload is an address name
address[addressnum] = record.substring(2);
addressnum++;
}
}
// Cartesian product of the factories and addresses that share this addressID
if(factorynum != 0 && addressnum!= 0){
for(int m = 0; m < factorynum; m++){
for(int n = 0; n < addressnum; n++){
context.write(new Text(factory[m]), new Text(address[n]));
}
}
}
}
}
public static void deletedir(String path) {
try {
HDFSUtils.DeleteHDFSFile(path);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MTJoin <in> <out>");
System.exit(2);
}
Job job = new Job(conf);
job.setJarByClass(MTJoin.class);
job.setJobName("MTJoin");
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
deletedir(otherArgs[1]);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sample input (two tables):
factoryname addressID
Beijing Red Star 1
addressID addressname
1 Beijing
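Worked out by hand (the post does not show it), the expected output for this tiny input is:
factoryname addressname
Beijing Red Star Beijing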