hadoop_MapReduce处理topKey程序

原创 2015年11月21日 18:27:45
hadoop_MapReduce处理topKey程序

 

实例1

输入文本:

需求分析:得出此文本中单词数的最大值,仅输出一行,如:name 61

代码分析:本实例中,只用到了一个map并没有用到reduce,因为输入文件只有一个,所以没有必要再写一个reduce(稍后会列出,多个输入文件,reduce处理)

代码如下:

package com.ucky.topkMapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class topkMapreduce {

	/**
	 * @author UCKY.Kit
	 * 因为输入文件为一个文件所有已个map任务执行即可,不再需要Reduce处理
	 * 输入的文件,为已经被WordConut技术过的文本
	 */
	static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {

		private LongWritable mapOutputValue = new LongWritable();
		private Text mapOutputKey = new Text();

		//记录最大值并赋初始值
		private long num = Long.MIN_VALUE;

		@Override
		public void setup(
				Mapper<LongWritable, Text, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			super.setup(context);
		}

		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			//获取VALUE
			String strValue = value.toString();
			//切割
			String strs[] = strValue.split("\t");
			
			long temp = Long.valueOf(strs[1]);
			//比较最大值并记录
			if (num < temp) {
				num = temp;
				mapOutputKey.set(strs[0]);
			}
		}

		@Override
		public void cleanup(Context context) throws IOException,
				InterruptedException {
			//设置输出VALUE
			mapOutputValue.set(num);
			context.write(mapOutputKey, mapOutputValue);
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = new Job(conf, topkMapreduce.class.getSimpleName());

		job.setJarByClass(topkMapreduce.class);
		
		job.setMapperClass(Map.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		//将REDUCE数设置为0,默认为1
		job.setNumReduceTasks(0);

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		job.waitForCompletion(true);

	}
}<strong>
</strong>




实例2

输入文本:

需求分析:得出此文本中单词数的最大值,仅输出最大数的前3行

代码分析:本实例中,只用到了一个map并没有用到reduce,因为输入文件只有一个,所以没有必要再写一个reduce(稍后会列出,多个输入文件,reduce处理)

代码如下:

package com.ucky.topkMapreduce;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class topkMapreduce2 {

	/**
	 * Emits the KEY (3) largest counts found in a pre-counted "word\tcount"
	 * file. Map-only job: a single mapper keeps a bounded TreeSet of the
	 * largest values and writes them out in cleanup().
	 *
	 * Note: TreeSet&lt;Long&gt; de-duplicates equal counts, so repeated
	 * values appear only once in the output (original behaviour, kept here;
	 * see topkMapreduce3 for the word-aware variant).
	 */
	static class Map extends Mapper<LongWritable, Text, NullWritable, LongWritable> {

		// Number of top values to keep.
		public static final int KEY = 3;
		private LongWritable mapOutputValue = new LongWritable();

		// Sorted ascending; once the set grows past KEY entries the smallest
		// element is evicted, leaving the KEY largest values.
		private TreeSet<Long> topSet = new TreeSet<Long>();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line is expected to look like "word\tcount".
			String[] strs = value.toString().split("\t");
			if (strs.length < 2) {
				return; // skip malformed lines instead of failing the task
			}

			try {
				topSet.add(Long.valueOf(strs[1].trim()));
			} catch (NumberFormatException e) {
				return; // skip lines whose count field is not a number
			}

			if (KEY < topSet.size()) {
				topSet.remove(topSet.first());
			}
		}

		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			// Write the surviving top values, smallest first.
			for (Long l : topSet) {
				mapOutputValue.set(l);
				context.write(NullWritable.get(), mapOutputValue);
			}
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = new Job(conf, topkMapreduce2.class.getSimpleName());

		job.setJarByClass(topkMapreduce2.class);

		job.setMapperClass(Map.class);
		// The mapper emits NullWritable keys — the original configuration
		// wrongly declared Text as the output key class here.
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(LongWritable.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));

		// Zero reducers: map output is written straight to the output path.
		job.setNumReduceTasks(0);

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure to the shell instead of ignoring it.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}


实例3

输入文本:

需求分析:得出此文本中单词数的最大值,仅输出最大数的前3行(优化)

代码分析:本实例中,只用到了一个map并没有用到reduce,因为输入文件只有一个,所以没有必要再写一个reduce(稍后会列出,多个输入文件,reduce处理)

代码如下:

自定义数据类型TopWritable


package com.ucky.topkMapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * A (word, count) pair used as a custom Hadoop data type by the top-k jobs.
 *
 * Natural ordering is by word first, then by count, so that compareTo is
 * consistent with equals (equal objects compare as 0 and vice versa).
 */
public class TopWritable implements WritableComparable<TopWritable> {

	private String word;
	// Primitive long instead of boxed Long: with the original Long field a
	// freshly constructed (no-arg) instance had num == null, and write()
	// threw a NullPointerException through auto-unboxing in writeLong(num).
	private long num;

	public TopWritable() {
		// Hadoop requires a no-arg constructor for deserialization; start
		// with a non-null word so write() is always safe to call.
		this.word = "";
	}

	public TopWritable(String word, long num) {
		set(word, num);
	}

	public String getWord() {
		return word;
	}

	public void set(String word, long num) {
		this.word = word;
		this.num = num;
	}

	/** Returns the count; boxed to preserve the original public signature. */
	public Long getNum() {
		return num;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(word);
		out.writeLong(num);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.word = in.readUTF();
		this.num = in.readLong();
	}

	@Override
	public int compareTo(TopWritable o) {
		int cmp = this.word.compareTo(o.getWord());
		if (cmp != 0) { // different words: word order decides
			return cmp;
		}
		// Same word: fall back to the count. Long.compare avoids the
		// boxing round-trip of Long.compareTo.
		return Long.compare(this.num, o.getNum().longValue());
	}

	@Override
	public String toString() {
		return word + "\t" + num;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + (int) (num ^ (num >>> 32));
		result = prime * result + ((word == null) ? 0 : word.hashCode());
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null || getClass() != obj.getClass())
			return false;
		TopWritable other = (TopWritable) obj;
		if (num != other.num)
			return false;
		return (word == null) ? other.word == null : word.equals(other.word);
	}

}



topkMapreduce3: 

package com.ucky.topkMapreduce;

import java.io.IOException;
import java.util.Comparator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class topkMapreduce3 {

	/**
	 * Map-only job that emits the KEY (3) words with the largest counts
	 * from a pre-counted "word\tcount" file, using the custom TopWritable
	 * pair type inside a bounded TreeSet.
	 */
	static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {

		// Number of top entries to keep.
		public static final int KEY = 3;

		// Ascending by count, ties broken by word. The original comparator
		// compared only the counts, so two DIFFERENT words with the same
		// count collided (compare == 0) and one of them was silently dropped
		// from the TreeSet — wrong top-k results.
		private TreeSet<TopWritable> topSet = new TreeSet<TopWritable>(
				new Comparator<TopWritable>() {
					@Override
					public int compare(TopWritable o1, TopWritable o2) {
						int cmp = o1.getNum().compareTo(o2.getNum());
						if (cmp != 0) {
							return cmp;
						}
						return o1.getWord().compareTo(o2.getWord());
					}
				});

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line is expected to look like "word\tcount".
			String[] strs = value.toString().split("\t");
			if (strs.length < 2) {
				return; // skip malformed lines instead of failing the task
			}

			try {
				topSet.add(new TopWritable(strs[0], Long.parseLong(strs[1].trim())));
			} catch (NumberFormatException e) {
				return; // skip lines whose count field is not a number
			}

			// Evict the smallest entry once the set exceeds KEY elements.
			if (KEY < topSet.size()) {
				topSet.remove(topSet.first());
			}
		}

		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			// Emit the surviving top-KEY pairs, smallest count first.
			for (TopWritable t : topSet) {
				context.write(new Text(t.getWord()), new LongWritable(t.getNum()));
			}
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = new Job(conf, topkMapreduce3.class.getSimpleName());

		job.setJarByClass(topkMapreduce3.class);

		job.setMapperClass(Map.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));

		// Zero reducers: map output is written straight to the output path.
		job.setNumReduceTasks(0);

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure to the shell instead of ignoring it.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}



实例4

输入文本: 

需求分析:得出此文本中单词数的最大值,仅输出最大数的前3行

代码分析:本实例中,为多个文件输入,一个map处理不过来,需要reduce来处理。看完代码会发现,map实际上只做了分割字符串并传递的工作;前几个例子中由map处理topKey的工作,在这里被转移到了reduce上去做(自定义数据类型TopWritable的代码已在上文给出,此处不再重复贴出)

代码如下:

package com.ucky.topkMapreduce;

import java.io.IOException;
import java.util.Comparator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class topkMapreduce4 {

	/**
	 * Full map+reduce top-k job for MULTIPLE input files: the mapper only
	 * splits each "word\tcount" line and forwards it; the single reducer
	 * sums the counts per word and keeps the KEY (3) largest totals in a
	 * bounded TreeSet, emitting them in cleanup().
	 */
	static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {

		private Text mapKey = new Text();
		private LongWritable mapValue = new LongWritable();

		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line is expected to look like "word\tcount".
			String[] strs = value.toString().split("\t");
			if (strs.length < 2) {
				return; // skip malformed lines instead of failing the task
			}

			try {
				mapValue.set(Long.parseLong(strs[1].trim()));
			} catch (NumberFormatException e) {
				return; // skip lines whose count field is not a number
			}
			mapKey.set(strs[0]);
			context.write(mapKey, mapValue);
		}
	}

	static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {

		// Number of top entries to keep.
		public static final int KEY = 3;

		// Ascending by total count, ties broken by word. The original
		// comparator compared only the counts, so two different words with
		// equal totals collided (compare == 0) and one was silently dropped.
		private TreeSet<TopWritable> topSet = new TreeSet<TopWritable>(
				new Comparator<TopWritable>() {
					@Override
					public int compare(TopWritable o1, TopWritable o2) {
						int cmp = o1.getNum().compareTo(o2.getNum());
						if (cmp != 0) {
							return cmp;
						}
						return o1.getWord().compareTo(o2.getWord());
					}
				});

		@Override
		public void reduce(Text key, Iterable<LongWritable> value,
				Context context) throws IOException, InterruptedException {
			// Sum this word's counts across all input files.
			long count = 0L;
			for (LongWritable t : value) {
				count += t.get();
			}

			topSet.add(new TopWritable(key.toString(), count));

			// Evict the smallest entry once the set exceeds KEY elements.
			if (KEY < topSet.size()) {
				topSet.remove(topSet.first());
			}
		}

		@Override
		public void cleanup(Context context) throws IOException,
				InterruptedException {
			// Emit the surviving top-KEY pairs, smallest count first.
			for (TopWritable t : topSet) {
				context.write(new Text(t.getWord()),
						new LongWritable(t.getNum()));
			}
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = new Job(conf, topkMapreduce4.class.getSimpleName());

		job.setJarByClass(topkMapreduce4.class);

		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);

		// Output types apply to both map output and final output here; the
		// original code set them twice by mistake.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// A global top-k requires exactly ONE reducer (the TreeSet lives in
		// reducer state); 1 is the default, made explicit here.
		job.setNumReduceTasks(1);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure to the shell instead of ignoring it.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}




 

相关文章推荐

使用hadoop实现平均数~并输出top N

本文通过具体的实例,介绍如何使用hadoop中mapreduce程序解决平均数以及top N的相关问题。...

025_MapReduce样例Hadoop TopKey算法

1、需求说明 2、 某个文件中某列数据的最大值。 思路:对每一个列的值依次进行比较,保存最大的值进行输出,算法的思想类似于排序算法(快速和冒泡排序)。 Mapper:因为只是在wordcount统计...

从零开始最短路径学习Hadoop之02----处理气象数据的第一个MapReduce程序

编写一个气象数据挖掘的MapReduce程序 1. 气象数据在哪里?     NCDC  美国国家气候数据中心     获取数据的方式在www.hadoopbook.com里给出了,是这里...

Hadoop 中文编码相关问题 -- mapreduce程序处理GBK编码数据并输出GBK编码数据

输入是GBK文件, 输出也是 GBK 文件的示例代码: Hadoop处理GBK文本时,发现输出出现了乱码,原来HADOOP在涉及编码时都是写死的UTF-8,如果文件编码格式是其它类型(如...
  • zklth
  • zklth
  • 2013年09月19日 13:39
  • 12137

如何在Hadoop的MapReduce程序中处理JSON文件

简介: 尽量在写MapReduce程序处理日志时,需要解析JSON配置文件,简化Java程序和处理逻辑。但是Hadoop本身似乎没有内置对JSON文件的解析功能,我们不得不求助于第三方JSON工具包...

Hadoop 中文编码相关问题 -- mapreduce程序处理GBK编码数据并输出GBK编码数据

最近些统计程序时,使用了原生的java hadoop,以前使用streaming 模式, 用awk脚本写时,没有遇到编码问题,原生的java hadoop程序,在处理gb18030格式log时,当输出...

简单的MapReduce程序(Hadoop2.2.0)

  • 2013年12月18日 17:58
  • 13KB
  • 下载

hadoop运行python编写的mapreduce程序

  • 2016年03月01日 16:38
  • 349B
  • 下载

编写简单的Mapreduce程序并部署在Hadoop2.2.0上运行

经过几天的折腾,终于配置好了Hadoop2.2.0(如何配置在Linux平台部署Hadoop请参见本博客《在Fedora上部署Hadoop2.2.0伪分布式平台》),今天主要来说说怎么在Hadoop2...

HADOOP之MAPREDUCE程序应用二

摘要:MapReduce程序进行单词计数。 关键词:MapReduce程序 单词计数
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:hadoop_MapReduce处理topKey程序
举报原因:
原因补充:

(最多只允许输入30个字)