Hadoop Self-Study Guide 4: Common MapReduce Examples

1. Preface

This post walks through a few common, classic MapReduce examples.

2. Deduplication

Idea: the MapReduce shuffle phase merges identical keys, so emitting every input line as a key is all it takes to remove duplicates.

Core: Map -> context.write(new Text(line), new Text("")); Reduce -> context.write(key, new Text(""))
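For example (a small illustration with made-up sample lines), duplicate input lines collapse to a single output line:

Input:
2012-3-1 a
2012-3-2 b
2012-3-1 a
2012-3-3 c
2012-3-2 b

Output:
2012-3-1 a
2012-3-2 b
2012-3-3 c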

package hadoop.v5;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import hadoop.utils.HDFSUtils;

/**
 * @author : chenhaipeng
 * @date : 2015年9月6日 上午2:00:50
 */
public class Duplication extends Configured implements Tool {
	
	public static class Map extends Mapper<LongWritable, Text, Text, Text>{
		/*
		 * Emit the whole input line as the key with an empty value; the
		 * shuffle phase then merges duplicate lines into a single key.
		 */
		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			context.write(new Text(line), new Text(""));
		}
		
	}
	
	public static class Reduce extends Reducer<Text, Text, Text, Text>{
		/*
		 * All values for a key are duplicates of the same line, so write the
		 * key once with an empty value.
		 */
		@Override
		public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
			context.write(key, new Text(""));
		}
		
	}

	public static void main(String[] args) throws Exception {
		int ret = ToolRunner.run(new Duplication(), args);
		System.exit(ret);

	}
	
	// Delete the output path up front; a job fails if its output directory already exists
	public static void deletedir(String path){
		try {
			HDFSUtils.DeleteHDFSFile(path);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/* 
	 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
	 */
	@Override
	public int run(String[] args) throws Exception {
		Job job = new Job(getConf());
		job.setJarByClass(Duplication.class);
		job.setJobName("Duplication");
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
	// Not strictly required: TextInputFormat and TextOutputFormat are the defaults
//		job.setInputFormatClass(TextInputFormat.class);
//		job.setOutputFormatClass(TextOutputFormat.class);
		
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		
		job.setCombinerClass(Reduce.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		deletedir(args[1]);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		boolean success = job.waitForCompletion(true);
		return success ? 0 : 1;
		
	}

}
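The helper class hadoop.utils.HDFSUtils used by deletedir() is not shown in this series. A minimal sketch of what DeleteHDFSFile might look like (the class and method names simply mirror the calls above; the body is an assumption, not the original implementation):

package hadoop.utils;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HDFSUtils {

	// Recursively delete a path on HDFS (e.g. an old job output directory),
	// so the next run does not fail with "output directory already exists".
	public static void DeleteHDFSFile(String path) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(path), conf);
		Path p = new Path(path);
		if (fs.exists(p)) {
			fs.delete(p, true); // true = delete recursively
		}
		fs.close();
	}
}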

3. Sorting

Idea: Hadoop automatically shuffles and sorts the map output; to control the order we plug in our own comparator and partitioner.

package hadoop.v5;

import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import hadoop.utils.EJob;
import hadoop.utils.HDFSUtils;

/**
 * Scenario: sort the word counts produced by WordCount from high to low.
 * Approach: Hadoop shuffles and sorts the map output automatically; we supply our own comparator and partitioner.
 * @author : chenhaipeng
 * @date : 2015年9月8日 下午11:58:32
 */
public class SortWordCount {

	public static class SortMap extends Mapper<LongWritable, Text, IntWritable, Text> {
		private Text word = new Text();
		private IntWritable count = new IntWritable();

		/*
		 * Swap key and value: emit (count, word) so that the shuffle
		 * sorts the records by count.
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			if (StringUtils.isNotEmpty(line)) {
				// each line of the WordCount output is a "word count" pair
				StringTokenizer tokenizer = new StringTokenizer(line);
				while (tokenizer.hasMoreTokens()) {
					word.set(tokenizer.nextToken().trim());
					count.set(Integer.parseInt(tokenizer.nextToken().trim()));
					context.write(count, word);
				}
			}

		}
	}

	public static class SortReduce extends Reducer<IntWritable, Text, Text, IntWritable> {
		/*
		 * The shuffle already delivers keys sorted by our comparator; the
		 * reducer only swaps key and value back to the "word count" form.
		 */
		@Override
		protected void reduce(IntWritable key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			for (Text value : values) {
				context.write(value, key);
			}

		}

	}
	
	/**
	 * The partitioner splits the key range across several reducers; with a skewed split one reducer's output file can come out empty.
	 * @author : chenhaipeng
	 * @date : 2015年9月9日 上午12:38:54
	 */
	public static class SortPartition extends Partitioner<IntWritable, Text>{

		/* 
		 * @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
		 */
		@Override
		public int getPartition(IntWritable key, Text value, int numPartitions) {
			// keys with a count of at least maxValue go to partition 0
			int maxValue = 2;
			int keySection = 0; 
			// only compute a section when there is more than one reducer and the key
			// (IntWritable.hashCode() is the wrapped int value) is below maxValue;
			// otherwise fall through and return partition 0
			if(numPartitions > 1 && key.hashCode() < maxValue){
				int sectionValue = maxValue / (numPartitions -1);
				int count = 0; 
				while(key.hashCode() - sectionValue * count > sectionValue){
					count++; 
				}
				keySection = numPartitions -1 - count;
			}
			return keySection;
		}
		
	}
	
	public static class SortComparator extends WritableComparator{

		/**
		 * Register IntWritable as the key class (true = create key instances for comparison).
		 */
		protected SortComparator() {
			super(IntWritable.class,true);
		}

		/* Descending order: negate the default ascending comparison
		 * @see org.apache.hadoop.io.WritableComparator#compare(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.WritableComparable)
		 */
		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			return -super.compare(a, b);
		}
		
	}
	public static void deletedir(String path){
		try {
			HDFSUtils.DeleteHDFSFile(path);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) throws Exception {
		// package the compiled classes into a temporary jar so the job can be submitted from the IDE
		File jarFile = EJob.createTempJar("bin");
//		EJob.addClasspath("D:/hadoop-1.2.1/conf/conf");
		ClassLoader classLoader = EJob.getClassLoader();
		Thread.currentThread().setContextClassLoader(classLoader);
		
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: SortWordCount <in> <out>");
			System.exit(2);
		}
		// submit the job to the real cluster (JobTracker address)
		conf.set("mapred.job.tracker", "192.168.100.150:9001");
		
		Job job = new Job(conf);
		job.setJobName("SortWordCount");
		job.setJarByClass(SortWordCount.class);
		
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(Text.class);
		
		job.setMapperClass(SortMap.class);
		job.setReducerClass(SortReduce.class);
		job.setPartitionerClass(SortPartition.class);
		job.setSortComparatorClass(SortComparator.class);
		job.setNumReduceTasks(2);
		
		// point the submitted job at the temporary jar built above
		((JobConf) job.getConfiguration()).setJar(jarFile.toString());
		
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		deletedir(args[1]);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		System.exit(job.waitForCompletion(true)? 0 : 1);
		
		

	}

}
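A small worked example, assuming the input is ordinary WordCount output ("word count" per line) and the settings above (maxValue = 2, two reduce tasks):

Input:
hadoop 3
hello 5
mapreduce 1
world 2

part-r-00000 (counts of 2 or more, descending):
hello	5
hadoop	3
world	2

part-r-00001 (counts below 2):
mapreduce	1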

4. Single-Table Join (Self-Join)

Idea: a self-join of a table with itself. The shuffle groups records by key, so we emit each row twice, keyed by the parent (left copy) and by the child (right copy).

package hadoop.v5;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import hadoop.utils.HDFSUtils;

/**
 * Self-join of a table with itself. Idea: the shuffle groups records by key, so we
 * emit each row keyed by the parent (left copy) and by the child (right copy); the
 * reducer then forms the Cartesian product of grandchildren and grandparents.
 * 
 * @author : chenhaipeng
 * @date : 2015年9月9日 上午1:48:20
 */
public class SelfJoin {

	public static int time = 0;

	/*
	 * Map splits each line into child and parent and writes it twice: once keyed by
	 * the parent with tag "1" (left copy) and once keyed by the child with tag "2"
	 * (right copy).
	 */
	public static class Map extends Mapper<LongWritable, Text, Text, Text> {

		/*
		 * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
		 * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
		 */
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			if (StringUtils.isNotEmpty(line)) {
				String childName = new String();
				String parentName = new String();
				String relationType = new String();
				int i = 0;
				// find the tab separating child from parent
				while (line.charAt(i) != '\t') {
					i++;
				}
				String[] values = { line.substring(0, i), line.substring(i + 1) };
				// skip the header line ("child parent")
				if (values[0].compareTo("child") != 0) {
					childName = values[0].trim();
					parentName = values[1].trim();
					// tag "1": left copy, keyed by the parent (value carries the child name)
					relationType = "1";
					context.write(new Text(parentName), new Text(relationType + "+" + childName + "+" + parentName));
					relationType = "2";
					// tag "2": right copy, keyed by the child, so both copies meet in the shuffle
					context.write(new Text(childName), new Text(relationType + "+" + childName + "+" + parentName));
				}
			}

		}

	}

	public static class Reduce extends Reducer<Text, Text, Text, Text> {

		/*
		 * @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object,
		 * java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// write the header row once (this relies on there being a single reducer)
			if (time == 0) {
				context.write(new Text("grandChild"), new Text("grandParent"));
				time++;
			}
			int grandChildNum = 0;
			String[] grandChild = new String[10];
			int grandParentNum = 0;
			String[] grandParent = new String[10];
			Iterator<Text> iter = values.iterator();
			while (iter.hasNext()) {
				String record = iter.next().toString();
				int len = record.length();
				int i = 2;
				if (len == 0)
					continue;
				char relationType = record.charAt(0);
				String childname;
				String parentname;
				while (record.charAt(i) != '+') {
					i++;
				}
				childname = record.substring(2, i);
				parentname = record.substring(i + 1);

				// tag '1': the value carries one of the key's children
				if (relationType == '1') {
					grandChild[grandChildNum] = childname;
					grandChildNum++;
				} else {
					// tag '2': the value carries one of the key's parents
					grandParent[grandParentNum] = parentname;
					grandParentNum++;
				}
			}
			// Cartesian product of the grandchildren and grandparents for this key
			if (grandParentNum != 0 && grandChildNum != 0) {
				for (int m = 0; m < grandChildNum; m++) {
					for (int n = 0; n < grandParentNum; n++) {
						context.write(new Text(grandChild[m].trim()),new Text(grandParent[n].trim()));
					}
				}
			}
		}

	}

	public static void deletedir(String path) {
		try {
			HDFSUtils.DeleteHDFSFile(path);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: SelfJon <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf);
		job.setJarByClass(SelfJoin.class);
		job.setJobName("SelfJoin");
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		deletedir(args[1]);
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}

Sample input:

child	parent
Tom		Lucy
Tom		Jack
Jone	Lucy
Lucy	Marry
Lucy	Ben
Jack	Alice
Jack	Jesse
Terry	Alice
Terry	Jesse
Philip	Terry
Philip	Alma
Mark	Terry
Mark	Alma
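With this input the job should produce the following grandchild/grandparent pairs (the exact row order can vary, since it depends on how values are grouped during the shuffle):

grandChild	grandParent
Tom	Alice
Tom	Jesse
Tom	Marry
Tom	Ben
Jone	Marry
Jone	Ben
Philip	Alice
Philip	Jesse
Mark	Alice
Mark	Jesse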


5. Multi-Table Join

Idea: similar to the single-table join, except the two inputs are different tables, so we must decide up front which is the left table and which is the right.

Core:

			// find where the address ID (the first run of digits) starts
			while (line.charAt(i) > '9' || line.charAt(i) < '0') {
				i++;
			}

			// left table: the line starts with a factory name, not a digit
			if (line.charAt(0) > '9' || line.charAt(0) < '0') {
				int j = i - 1;
				while (line.charAt(j) != ' ') j--;
				String[] values = { line.substring(0, j), line.substring(i) };
				context.write(new Text(values[1].trim()), new Text("1+" + values[0]));
			} else {
				// right table: the line starts with the address ID
				int j = i + 1;
				while (line.charAt(j) != ' ') j++;
				String[] values = { line.substring(0, j), line.substring(j + 1) };
				context.write(new Text(values[0].trim()), new Text("2+" + values[1]));
			}

Code:

package hadoop.v5;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import hadoop.utils.HDFSUtils;

/**
 * Multi-table join; similar in structure to the self-join above.
 * @author : chenhaipeng
 * @date : 2015年9月10日 上午12:33:26
 */
public class MTJoin {
	
	public static int time = 0; 
	
	public static class Map extends Mapper<LongWritable, Text, Text, Text>{

		/* 
		 * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
			String line = value.toString();
			int i = 0; 
			if(line.contains("factoryname") == true || line.contains("addressID") == true){
				return;
			}
			//找出数据的分割点
			while(line.charAt(i) >= '9' || line.charAt(i) <= '0'){
				i++;
			}
			
			//代表的是左表
			if(line.charAt(0)>= '9' || line.charAt(0) <= '0'){
				int j = i-1; 
				while(line.charAt(j) != ' ')j--;
				String[] values = {line.substring(0,j),line.substring(i)};
				context.write(new Text(values[1].trim()), new Text("1+"+values[0]));
			}else{
				int j = i+1;
				while(line.charAt(j) != ' ')j++;
				String[] values = {line.substring(0,i+1),line.substring(j)};
				context.write(new Text(values[0].trim()), new Text("2+"+values[1]));
			}
			
			
		}
		
	}
	
	public static class Reduce extends Reducer<Text, Text, Text, Text>{

		/* 
		 * @see org.apache.hadoop.mapreduce.Reducer#reduce(java.lang.Object, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
		 */
		@Override
		protected void reduce(Text text, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// write the header row once (this relies on there being a single reducer)
			if(time == 0){
				context.write(new Text("factoryname"), new Text("addressname"));
				time++;
			}
			int factorynum = 0; 
			String factory[] = new String[10];
			int addressnum  = 0; 
			String address[] = new String[10];
			Iterator<Text> iter = values.iterator();
			while(iter.hasNext()){
				String record = iter.next().toString();
				char type = record.charAt(0);
				if (type == '1') {	// left table: the value carries a factory name
					factory[factorynum] = record.substring(2);	
					factorynum++;
				} else {	// right table: the value carries an address name
					address[addressnum] = record.substring(2);
					addressnum++;
				}
			}
			// Cartesian product: pair every factory with every address for this ID
			if(factorynum != 0 && addressnum!= 0){
				for(int m = 0; m < factorynum; m++){
					for(int n = 0; n < addressnum; n++){
						context.write(new Text(factory[m]), new Text(address[n]));
					}
				}
			}
			
		
		}
		
	}
	public static void deletedir(String path) {
		try {
			HDFSUtils.DeleteHDFSFile(path);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: MTJoin <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf);
		job.setJarByClass(MTJoin.class);
		job.setJobName("MTJoin");
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		deletedir(args[1]);
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}

Sample input:

factoryname addressID
Beijing Red Star 1

addressID addressname
1 Beijing
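With a single reducer (the default here), the expected output is:

factoryname	addressname
Beijing Red Star	Beijing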



