Data Algorithms (Top N): MapReduce + Spark (Java) Implementations (Unique-Key Case)

MapReduce Implementation

Driver class

package cn.weida.MapReduce.ToN;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import Util.HadoopUtil;


/**
 * A simple Top N job: defaults to the top ten, and duplicate keys are not handled.
 * A TreeMap is used so the candidates stay sorted automatically.
 * Each map task builds a local Top N, then sends its results to a single
 * Reducer, which selects the final Top N.
 * @author acm160920007
 * 
 * 1:40:22 PM, August 8, 2018
 *
 */
public class TopNDriver extends Configured implements Tool {

	private static Logger THE_LOGGER = Logger.getLogger(TopNDriver.class);

	@Override
	public int run(String[] arg0) throws Exception {
		Job job = Job.getInstance(getConf());
		HadoopUtil.addJarsToDistributedCache(job, "/lib/");
		int N = Integer.parseInt(arg0[0]);
		job.getConfiguration().setInt("N", N);
		job.setJobName("Top N");

		job.setInputFormatClass(SequenceFileInputFormat.class);
		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		// map output (key, value) types
		job.setMapOutputKeyClass(NullWritable.class);
		job.setMapOutputValueClass(Text.class);
		// job output (key, value) types
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(Text.class);

		job.setJarByClass(TopNDriver.class);
		job.setMapperClass(TopN_Mapper.class);
		job.setReducerClass(TopN_Reducer.class);
		job.setNumReduceTasks(1);                 // a single reducer makes the final Top N global

		Path input = new Path(arg0[1]);
		Path output = new Path(arg0[2]);

		FileInputFormat.setInputPaths(job, input);
		FileOutputFormat.setOutputPath(job, output);

		boolean status = job.waitForCompletion(true);
		THE_LOGGER.info("run(): status=" + status);
		return status ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		if (args.length != 3) {
			THE_LOGGER.warn("Usage: TopNDriver <N> <input> <output>");
			System.exit(1);
		}

		THE_LOGGER.info("N=" + args[0]);
		THE_LOGGER.info("inputDir=" + args[1]);
		THE_LOGGER.info("outputDir=" + args[2]);
		int returnStatus = ToolRunner.run(new TopNDriver(), args);
		System.exit(returnStatus);
	}

}
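Assuming the job and its dependencies are packaged into a runnable jar (the jar name and HDFS paths below are hypothetical), the driver can be launched as:

hadoop jar topn-example.jar cn.weida.MapReduce.ToN.TopNDriver 10 /data/topn/input /data/topn/output

The first argument is N, which the driver stores in the job configuration so that the mapper and reducer can read it back in their setup() methods.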

Mapper class

package cn.weida.MapReduce.ToN;

import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopN_Mapper extends Mapper<LongWritable, Text, NullWritable, Text>{

	// local top N, kept sorted by weight
	private SortedMap<Integer, String> TopNcats = new TreeMap<Integer,String>();
	private int N = 10;    // default: top 10
	
	@Override
	protected void cleanup(Mapper<LongWritable, Text, NullWritable, Text>.Context context)
			throws IOException, InterruptedException {
		for (String catAttributes : TopNcats.values()) {
			context.write(NullWritable.get(), new Text(catAttributes));
		}
	}

	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, Text>.Context context)
			throws IOException, InterruptedException {
		// each record is "<weight>,<...>"; the first field is the numeric weight
		String[] tokens = value.toString().trim().split(",");
		Integer weight = Integer.parseInt(tokens[0]);
		TopNcats.put(weight, value.toString());
		if (TopNcats.size() > N) {
			// TopNcats.remove(TopNcats.lastKey()); // for Bottom N
			TopNcats.remove(TopNcats.firstKey());   // for Top N: evict the smallest weight
		}
	}

	@Override
	protected void setup(Mapper<LongWritable, Text, NullWritable, Text>.Context context)
			throws IOException, InterruptedException {
		Configuration conf = context.getConfiguration();
		this.N = conf.getInt("N",10);
	}

}
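The heart of the mapper is the bounded TreeMap: insert every record, then evict the smallest key whenever the map grows past N. A minimal standalone sketch of just that trick, using made-up sample weights:

import java.util.SortedMap;
import java.util.TreeMap;

public class LocalTopNDemo {
	public static void main(String[] args) {
		int n = 3;                                   // keep the 3 largest weights
		int[] weights = {40, 10, 77, 25, 63, 5, 89}; // hypothetical sample data
		SortedMap<Integer, String> topN = new TreeMap<Integer, String>();
		for (int w : weights) {
			topN.put(w, "record-" + w);
			if (topN.size() > n) {
				topN.remove(topN.firstKey());        // evict the current minimum
			}
		}
		System.out.println(topN); // {63=record-63, 77=record-77, 89=record-89}
	}
}

Because a TreeMap keeps its keys sorted, firstKey() is always the weakest candidate, so each insert-plus-evict step costs O(log N).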

Reducer class

package cn.weida.MapReduce.ToN;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopN_Reducer extends Reducer<NullWritable, Text, IntWritable, Text>{

	private int N = 10;
	private SortedMap<Integer, String> FinaltopN = new TreeMap<Integer,String>();
	
	@Override
	protected void reduce(NullWritable key, Iterable<Text> values,
			Reducer<NullWritable, Text, IntWritable, Text>.Context context) throws IOException, InterruptedException {
		for (Text catRecord : values) {
			// records arrive as "<weight>,<...>", exactly as the mapper emitted them
			String[] tokens = catRecord.toString().trim().split(",");
			int weight = Integer.parseInt(tokens[0]);
			FinaltopN.put(weight, catRecord.toString());
			if (FinaltopN.size() > N) {
				// FinaltopN.remove(FinaltopN.lastKey()); // for Bottom N
				FinaltopN.remove(FinaltopN.firstKey());   // for Top N: evict the smallest weight
			}
		}

		// emit in descending order of weight
		List<Integer> keys = new ArrayList<Integer>(FinaltopN.keySet());
		for (int i = keys.size() - 1; i >= 0; i--) {
			context.write(new IntWritable(keys.get(i)), new Text(FinaltopN.get(keys.get(i))));
		}
	}

	@Override
	protected void setup(Reducer<NullWritable, Text, IntWritable, Text>.Context context)
			throws IOException, InterruptedException {
		Configuration conf = context.getConfiguration();
		this.N = conf.getInt("N",10);
	}
	

}
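For reference, both classes above assume comma-separated records whose first field is the numeric weight, for example (hypothetical data):

90,cat1,lion
42,cat2,tabby
77,cat3,tiger

With N = 2, the job would emit (90, "90,cat1,lion") and then (77, "77,cat3,tiger"), highest weight first.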

Spark Implementation

package cn.weida.Spark.TopN;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;

import scala.Tuple2;

/**
 * Handles inputs whose keys are unique, i.e. (A,5) and (A,10) will never
 * both appear in the input.
 *
 * Requires N > 0.
 * 
 *  Top-10 Design Pattern: "Top Ten" Structure 
 * 
 *    class mapper : 
 *         setup(): create the local top N structure
 *         map(key, record): 
 *                       Insert record into the top ten sorted list.
 *                       If the length of the list is greater than 10,
 *                       truncate the list to a length of 10.
 *         cleanup() : emit the local top N
 *
 *    class reducer: 
 *               setup(): initialize top ten sorted list 
 *               reduce(key, records): sort records 
 *                                     truncate records to top 10 
 *             
 * @author acm160920007
 * 
 * 10:25:43 AM, August 9, 2018
 **/
public class TopN {

	public static void main(String[] args) {
		if (args.length != 3) {
			System.out.println("Usage:TopN N [top/bottom] <hdfs-file>");
			System.exit(1);
		}
		int topN = Integer.parseInt(args[0]);
		String direction = args[1];
		if (!(direction.equals("top")||direction.equals("bottom"))) {
			System.out.println("Usage:TopN N [top/bottom] <hdfs-file>");
			System.exit(1);
		}
		
		String inputPath = args[2];
		System.out.println("inputPath : <hdfs-file>=" + inputPath);
		
		
		// master URL and app name are expected to be supplied by spark-submit
		JavaSparkContext ctx = new JavaSparkContext();
		
		Broadcast<Integer> broadcastTopN = ctx.broadcast(topN);
		Broadcast<String> broadcastDirection = ctx.broadcast(direction);

		// read the input file
		JavaRDD<String> line = ctx.textFile(inputPath, 1);

		// map each "key,value" line to a (String, Integer) pair
		JavaPairRDD<String, Integer> pairs = line.mapToPair(new PairFunction<String, String, Integer>() {
			public Tuple2<String, Integer> call(String s) {
				String[] tokens = s.split(",");
				return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
			}
		});

		// build a local top N within each partition
		JavaRDD<SortedMap<Integer, String>> partitions = pairs
				.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, SortedMap<Integer, String>>() {
					@Override
					public Iterator<SortedMap<Integer, String>> call(Iterator<Tuple2<String, Integer>> iter) {
						SortedMap<Integer, String> topN = new TreeMap<Integer, String>(); // equivalent to setup()
						while (iter.hasNext()) { // equivalent to map()
							Tuple2<String, Integer> tuple = iter.next();
							topN.put(tuple._2, tuple._1);
							if (topN.size() > broadcastTopN.value()) {
								// note: read the broadcast with value(), not toString()/equals()
								if (broadcastDirection.value().equals("top")) {
									topN.remove(topN.firstKey());
								} else if (broadcastDirection.value().equals("bottom")) {
									topN.remove(topN.lastKey());
								}
							}
						}
						return Collections.singletonList(topN).iterator(); // equivalent to cleanup()
					}
				});
		
		
		// merge all local top N maps into the final top N
		SortedMap<Integer, String> finaltopN = new TreeMap<Integer, String>();
		List<SortedMap<Integer, String>> alltopN = partitions.collect();
		for (SortedMap<Integer, String> localtopN : alltopN) {
			for (Map.Entry<Integer, String> entry : localtopN.entrySet()) {
				finaltopN.put(entry.getKey(), entry.getValue());
				if (finaltopN.size() > broadcastTopN.value()) {
					if (broadcastDirection.value().equals("top")) {
						finaltopN.remove(finaltopN.firstKey());
					} else if (broadcastDirection.value().equals("bottom")) {
						finaltopN.remove(finaltopN.lastKey());
					}
				}
			}
		}
		
		
		// TreeMap iterates in ascending key order
		for (Map.Entry<Integer, String> entry : finaltopN.entrySet()) {
			System.out.println(entry.getKey() + "--" + entry.getValue());
		}
		
		ctx.stop();
		System.exit(0);
	}
	
	 

}
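As an aside, the per-partition logic above can be written far more compactly with Spark's built-in takeOrdered action, which returns the first N elements under a given comparator (the comparator must be Serializable because it is shipped to executors). A rough sketch under the same assumed "key,value" input format:

import java.io.Serializable;
import java.util.Comparator;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TopNBuiltin {
	// descending by numeric value, so takeOrdered returns the Top N
	static class ByValueDesc implements Comparator<Tuple2<String, Integer>>, Serializable {
		@Override
		public int compare(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
			return b._2.compareTo(a._2);
		}
	}

	public static void main(String[] args) {
		int n = Integer.parseInt(args[0]);
		JavaSparkContext ctx = new JavaSparkContext();
		JavaRDD<Tuple2<String, Integer>> pairs = ctx.textFile(args[1])
				.map(s -> {
					String[] tokens = s.split(",");
					return new Tuple2<>(tokens[0], Integer.parseInt(tokens[1]));
				});
		List<Tuple2<String, Integer>> topN = pairs.takeOrdered(n, new ByValueDesc());
		for (Tuple2<String, Integer> t : topN) {
			System.out.println(t._2 + "--" + t._1);
		}
		ctx.stop();
	}
}

Reversing the comparator gives Bottom N. Unlike the collect() approach, takeOrdered merges the per-partition candidates inside Spark, so only N elements per partition ever reach the driver.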
