Today I suddenly noticed that I'd never written up a proper MapReduce job on this blog, so here is a TopK-URL flow example.
Mapper
package club.drguo.hadoop.mapreduce.topkurl;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKeyURLMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    // Reuse the same output objects across map() calls instead of
    // allocating a new pair for every input record.
    private FlowBean bean = new FlowBean();
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] strings = StringUtils.split(line, "\t");
        try {
            // Field 26 holds the url, fields 30/31 the up/down flow.
            if (strings.length > 32 && StringUtils.isNotEmpty(strings[26]) && strings[26].startsWith("http")) {
                String url = strings[26];
                long up_flow = Long.parseLong(strings[30]);
                long down_flow = Long.parseLong(strings[31]);
                k.set(url);
                bean.set(up_flow, down_flow);
                context.write(k, bean);
            }
        } catch (Exception e) {
            // Dirty records that fail to parse are simply skipped.
            System.out.println("TopKeyURLMapper: bad record skipped");
        }
    }
}
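A note on the input format: the guard in map() expects log lines with at least 33 tab-separated fields, where field 26 holds the visited url and fields 30/31 hold the upstream/downstream byte counts. Since org.apache.commons.lang.StringUtils.split skips empty tokens, records with empty columns come out with fewer, shifted fields and fail the length check. Here is a tiny standalone check against a made-up line; the field layout is taken from the code above, the values are invented:

public class InputLineCheck {
    public static void main(String[] args) {
        // Build a made-up 33-field record matching the layout the Mapper expects.
        String[] fields = new String[33];
        java.util.Arrays.fill(fields, "1");
        fields[26] = "http://example.com"; // url
        fields[30] = "1024";               // up_flow
        fields[31] = "4096";               // down_flow
        String line = String.join("\t", fields);

        // Same guard as in TopKeyURLMapper.map().
        String[] strings = org.apache.commons.lang.StringUtils.split(line, "\t");
        boolean accepted = strings.length > 32
                && org.apache.commons.lang.StringUtils.isNotEmpty(strings[26])
                && strings[26].startsWith("http");
        System.out.println(accepted); // prints true
    }
}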
Reducer
package club.drguo.hadoop.mapreduce.topkurl;

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKeyURLReducer extends Reducer<Text, FlowBean, Text, LongWritable> {
    // Declared as a field: if it were created inside reduce(), a new
    // TreeMap would be built for every url.
    private TreeMap<FlowBean, Text> treeMap = new TreeMap<>();
    // Total flow across all urls.
    private double globalCount = 0;

    // Input: <url, {bean, bean...}>
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        // Hadoop reuses the key object between calls, so copy it
        // before storing it in the TreeMap.
        Text url = new Text(key.toString());
        long up_sum = 0;
        long down_sum = 0;
        for (FlowBean bean : values) {
            up_sum += bean.getUp_flow();
            down_sum += bean.getDown_flow();
        }
        FlowBean bean = new FlowBean(up_sum, down_sum);
        globalCount += bean.getSum_flow();
        // The TreeMap keeps its entries sorted by FlowBean, i.e. by flow.
        treeMap.put(bean, url);
    }

    // cleanup() is called once just before the reduce task exits;
    // setup() is its counterpart, called once at the start.
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        Set<Entry<FlowBean, Text>> entrySet = treeMap.entrySet();
        double tempCount = 0;
        for (Entry<FlowBean, Text> ent : entrySet) {
            // Only write out the urls that together account for 80% of the total flow.
            if (tempCount / globalCount < 0.8) {
                context.write(ent.getValue(), new LongWritable(ent.getKey().getSum_flow()));
                tempCount += ent.getKey().getSum_flow();
            } else {
                return;
            }
        }
    }
}
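FlowBean
The Mapper and Reducer both depend on a FlowBean class that this post doesn't show. Below is a minimal sketch reconstructed from the calls above (set(), getUp_flow(), getDown_flow(), getSum_flow() and the (up, down) constructor); the original may differ in details. It has to implement WritableComparable rather than plain Writable, because the Reducer's TreeMap sorts on it; this compareTo() orders by total flow descending and never returns 0, so two urls with equal flow don't overwrite each other in the map.

package club.drguo.hadoop.mapreduce.topkurl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {
    private long up_flow;
    private long down_flow;
    private long sum_flow;

    // Hadoop needs a no-arg constructor for deserialization.
    public FlowBean() {
    }

    public FlowBean(long up_flow, long down_flow) {
        set(up_flow, down_flow);
    }

    public void set(long up_flow, long down_flow) {
        this.up_flow = up_flow;
        this.down_flow = down_flow;
        this.sum_flow = up_flow + down_flow;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public long getDown_flow() {
        return down_flow;
    }

    public long getSum_flow() {
        return sum_flow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(up_flow);
        out.writeLong(down_flow);
        out.writeLong(sum_flow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        up_flow = in.readLong();
        down_flow = in.readLong();
        sum_flow = in.readLong();
    }

    // Descending by total flow, and never 0, so the TreeMap iterates
    // from the largest consumer down without dropping ties.
    @Override
    public int compareTo(FlowBean o) {
        return sum_flow > o.sum_flow ? -1 : 1;
    }
}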
Runner
package club.drguo.hadoop.mapreduce.topkurl;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// club.drguo.hadoop.mapreduce.topkurl.TopKeyURLRunner
public class TopKeyURLRunner extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner instead of creating a new one.
        Configuration configuration = getConf();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(TopKeyURLRunner.class);

        job.setMapperClass(TopKeyURLMapper.class);
        job.setReducerClass(TopKeyURLReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, "hdfs://ns1/flow/srclog");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://ns1/flow/topkurl"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new TopKeyURLRunner(), args);
        System.exit(res);
    }
}
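To launch the job, package these classes together with FlowBean into a jar and submit it through the hadoop client; the jar name below is only an assumption, and the main class is the one noted in the Runner's comment:

hadoop jar flow.jar club.drguo.hadoop.mapreduce.topkurl.TopKeyURLRunner

The result can then be inspected with hadoop fs -cat /flow/topkurl/part-r-00000 (part-r-00000 being the default name of the first reducer's output file).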