The raw input dataset is as follows:
All of us have read thrilling stories in which the hero had only a limited and specified time to live. Sometimes it was as long as a year, sometimes as short as 24 hours. But always we were interested in discovering just how the doomed hero chose to spend his last days or his last hours. I speak, of course, of free men who have a choice, not condemned criminals whose sphere of activities is strictly delimited.
After running, the word-count result looks like this:
of 21, a 20, us 15, and so on.
Two MapReduce jobs are used in total: the first counts how many times each word occurs, and the second sorts the counts and extracts the top K.
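For reference, the first job writes one word per line followed by a tab and its count (the default TextOutputFormat layout), and the second job re-parses those lines, so its mapper only has to tell the numeric token apart from the word. Using the counts quoted above, a few lines of that intermediate file would look roughly like:

a	20
of	21
us	15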
First, a custom class MyInt is defined: a Comparable wrapper around Integer that is later used as the key of the TreeMap holding the top-K candidates in the second job.
package topk;

// A comparable Integer wrapper; used as the key of the TreeMap
// that keeps the top-K entries in the second job.
public class MyInt implements Comparable<MyInt> {

    private Integer value;

    public MyInt(Integer value) {
        this.value = value;
    }

    public Integer getValue() {
        return value;
    }

    public void setValue(Integer value) {
        this.value = value;
    }

    @Override
    public int compareTo(MyInt o) {
        // Natural (ascending) order; the TreeMap reverses it with its own Comparator.
        return value.compareTo(o.getValue());
    }
}
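A minimal sketch (a hypothetical standalone demo, not part of the jobs) of how MyInt behaves inside a TreeMap with a reversed Comparator, which is exactly how the second job keeps its candidates ordered from the largest count to the smallest:

package topk;

import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;

public class MyIntDemo {
    public static void main(String[] args) {
        // Reverse MyInt's natural order so the largest count comes first.
        TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1);
            }
        });
        tm.put(new MyInt(21), "of");
        tm.put(new MyInt(20), "a");
        tm.put(new MyInt(15), "us");
        for (Map.Entry<MyInt, String> e : tm.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey().getValue());
        }
        // Prints of 21, then a 20, then us 15 (descending by count).
    }
}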
Part 1: the first MapReduce job (word count)
package topk;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class top {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                // Strip a few punctuation characters before emitting <word, 1>.
                String word = st.nextToken().replaceAll("/", "").replace("'", "").replace(".", "");
                context.write(new Text(word), one);
            }
        }
    }

    public static class Reducer extends org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the 1s emitted by the mapper to get the word's total count.
            int count = 0;
            for (IntWritable val : values) {
                count += val.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static boolean run(String in, String out)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(top.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return job.waitForCompletion(true);
    }
}
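Because reduce() simply sums IntWritable values, the same class could also be registered as a combiner to pre-aggregate counts on the map side and cut down the shuffled data. This is an optional tweak, not part of the original code; the only change would be one extra line in run():

        job.setMapperClass(Map.class);
        // Optional: pre-aggregate <word, 1> pairs on the map side.
        // Safe only because reduce() sums the values rather than counting them.
        job.setCombinerClass(Reducer.class);
        job.setReducerClass(Reducer.class);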
Part 2: the second MapReduce job (sort by count and keep the top K)
package topk;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class topk {

    public static class Map extends Mapper<Object, Text, IntWritable, Text> {
        IntWritable outKey = new IntWritable();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line from the first job looks like "word<TAB>count".
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                String element = st.nextToken();
                if (Pattern.matches("\\d+", element)) {
                    // The purely numeric token is the word's count
                    // (this assumes the words themselves are never pure digits).
                    outKey.set(Integer.parseInt(element));
                } else {
                    outValue.set(element);
                }
            }
            // Emit <count, word> so the shuffle sorts records by count.
            context.write(outKey, outValue);
        }
    }

    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<IntWritable, Text, Text, IntWritable> {
        private static MultipleOutputs<Text, IntWritable> mos = null;
        // K: how many of the most frequent words to keep.
        private static final int k = 10;
        // TreeMap ordered by descending count; note that two words with the same
        // count share a key, so the later one overwrites the earlier one.
        private static TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1);
            }
        });

        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                // Write the full sorted list to the regular output...
                context.write(text, key);
                // ...and keep at most k candidates for the top-K output.
                tm.put(new MyInt(key.get()), text.toString());
                if (tm.size() > k) {
                    // Evict the entry with the smallest count.
                    tm.remove(tm.lastKey());
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Write the surviving top-K entries to the named output.
            String path = context.getConfiguration().get("topKout");
            mos = new MultipleOutputs<Text, IntWritable>(context);
            Set<Entry<MyInt, String>> set = tm.entrySet();
            for (Entry<MyInt, String> entry : set) {
                mos.write("topKMOS", new Text(entry.getValue()),
                        new IntWritable(entry.getKey().getValue()), path);
            }
            mos.close();
        }
    }

    public static void run(String in, String out, String topKout)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Directory where the top-K words will be written.
        conf.set("topKout", topKout);
        Job job = Job.getInstance(conf, "Sort");
        job.setJarByClass(topk.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Register the MultipleOutputs named output used in cleanup().
        MultipleOutputs.addNamedOutput(job, "topKMOS", TextOutputFormat.class, Text.class, IntWritable.class);
        // Input and output directories.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.waitForCompletion(true);
    }
}
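One caveat: the top-K list lives in a single in-memory TreeMap, so the result is only globally correct when the sort job runs with exactly one reduce task (which is the default here). If the job were ever configured with more reducers, something along these lines would be needed in run() (an assumption about deployment, not part of the original code):

        // Force a single reducer so one TreeMap sees every <count, word> pair.
        job.setNumReduceTasks(1);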
Part 3: a main class that chains the two MapReduce jobs
package topk;

import java.io.IOException;

public class topkmain {
    public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException {
        // Text whose words will be counted and sorted.
        String in = "C:/danci.txt";
        // Output of the word-count job.
        String wordCount = "C:/outaa/wordCount";
        // Output of the sort job (full list, sorted by count).
        String sort = "C:/outaa/sort";
        // Output directory for the top-K named output.
        String topK = "C:/outaa/shuchudejieguo";
        // Start the sort job only if the word-count job finished successfully.
        if (top.run(in, wordCount)) {
            topk.run(wordCount, sort, topK);
        }
    }
}
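The paths above are hard-coded local Windows paths, which is fine for testing. As a minimal sketch (a hypothetical alternative driver with an assumed argument order, not part of the original code), the same chain could take its four paths from the command line instead, which also makes it usable with HDFS paths:

package topk;

import java.io.IOException;

public class topkmainArgs {
    public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
        if (args.length != 4) {
            System.err.println("Usage: topkmainArgs <in> <wordCountOut> <sortOut> <topKOut>");
            System.exit(2);
        }
        // args[0]: input text, args[1]: word-count output,
        // args[2]: sorted output, args[3]: top-K output directory.
        if (top.run(args[0], args[1])) {
            topk.run(args[1], args[2], args[3]);
        }
    }
}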