package com.zhiyou.bd23.topn;
import java.io.IOException;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//全局数据的topN
//在每个map节点求topn,发送到一个reduce节点再求一次topn
public class GloabTopN {
/**
 * Mapper that computes the top-N of the records seen by this map task.
 *
 * <p>Each input line is expected to be {@code <songName> <playCount>}, separated by
 * whitespace. The mapper keeps at most {@link #TOP_N} distinct play counts in a sorted
 * map; songs sharing a play count are joined with commas into a single value. Nothing
 * is emitted from {@code map}; the retained entries are written once, in
 * {@code cleanup}, in descending play-count order.
 *
 * <p>Lines that do not have exactly two fields, or whose second field is not an
 * integer, are skipped and counted in the {@code GloabTopN/MALFORMED_RECORDS} counter
 * instead of failing the task.
 */
public static class GloabTopNMap extends Mapper<Object, Text, IntWritable, Text> {
    /** Maximum number of distinct play counts retained by one map task. */
    private static final int TOP_N = 5;
    /** Counter group / name used to report skipped input lines. */
    private static final String COUNTER_GROUP = "GloabTopN";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    /** Play count -> comma-separated song names; sorted ascending so the smallest is first. */
    private final TreeMap<Integer, String> top5 = new TreeMap<Integer, String>();
    private final IntWritable outputKey = new IntWritable();
    private final Text outputValue = new Text();

    /**
     * Folds one input line into the task-local top-N.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input: song name and play count
     * @param context task context, used here only for the malformed-record counter
     */
    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, IntWritable, Text>.Context context)
            throws IOException, InterruptedException {
        // Trim first: leading whitespace would otherwise produce an empty first token.
        String[] infos = value.toString().trim().split("\\s+");
        // Validate BEFORE indexing; a blank line splits into a single empty token.
        if (infos.length != 2) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        int playCount;
        try {
            playCount = Integer.parseInt(infos[1]);
        } catch (NumberFormatException e) {
            // A non-numeric play count is a bad record, not a reason to kill the task.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }
        String song = infos[0];

        // Same play count already retained: append this song to the existing value.
        String sameCountSongs = top5.get(playCount);
        if (sameCountSongs != null) {
            top5.put(playCount, sameCountSongs + "," + song);
            return;
        }

        // New play count: add it, then drop the smallest entry if we are over capacity.
        top5.put(playCount, song);
        if (top5.size() > TOP_N) {
            top5.pollFirstEntry();
        }
    }

    /**
     * Emits the retained entries as (playCount, songNames), highest play count first.
     *
     * @param context task context that receives the output records
     */
    @Override
    protected void cleanup(Mapper<Object, Text, IntWritable, Text>.Context context)
            throws IOException, InterruptedException {
        for (Integer playCount : top5.descendingKeySet()) {
            outputKey.set(playCount);
            outputValue.set(top5.get(playCount));
            context.write(outputKey, outputValue);
        }
    }
}
/**
 * Reducer that receives every mapper's local top 5 and reduces them to one top 5.
 *
 * <p>For each play count the incoming song-name values are joined with commas. At most
 * five play counts are retained in a sorted map; once a sixth is added the smallest is
 * discarded. The survivors are written in {@code cleanup} as (songNames, playCount),
 * highest play count first.
 */
public static class GloabTopNReduce extends Reducer<IntWritable, Text, Text, IntWritable> {
    /** Play count -> comma-separated song names, ordered by ascending play count. */
    private TreeMap<Integer, String> top5 = new TreeMap<Integer, String>();
    private Text outputKey = new Text();
    private IntWritable outputValue = new IntWritable();

    /**
     * Joins all song names for one play count and folds them into the retained top 5.
     *
     * @param key     the play count
     * @param values  song-name strings emitted by the mappers for this play count
     * @param context task context (not written to here; output happens in cleanup)
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Text> values,
            Reducer<IntWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Comma-join the values; a separator is added only once something has been appended.
        StringBuilder joined = new StringBuilder();
        for (Text value : values) {
            if (joined.length() > 0) {
                joined.append(",");
            }
            joined.append(value.toString());
        }
        String songs = joined.toString();
        int playCount = key.get();

        // Play count already present: merge the song names into the existing value.
        String alreadyStored = top5.get(playCount);
        if (alreadyStored != null) {
            top5.put(playCount, alreadyStored + "," + songs);
            return;
        }

        // Otherwise insert, and evict the smallest play count when more than five are held.
        top5.put(playCount, songs);
        if (top5.size() > 5) {
            top5.remove(top5.firstKey());
        }
    }

    /**
     * Writes the retained entries as (songNames, playCount) in descending play-count order.
     *
     * @param context task context that receives the output records
     */
    @Override
    protected void cleanup(Reducer<IntWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        if (top5.isEmpty()) {
            return;
        }
        // Walk from the largest key down to the smallest.
        Integer playCount = top5.lastKey();
        while (playCount != null) {
            outputKey.set(top5.get(playCount));
            outputValue.set(playCount);
            context.write(outputKey, outputValue);
            playCount = top5.lowerKey(playCount);
        }
    }
}
/**
 * Configures and runs the global top-N job.
 *
 * <p>Note that the output directory is deleted recursively before the job starts, and
 * that the job is forced to a single reduce task so that one reducer sees every
 * mapper's local result.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output directory
 * @throws Exception if the job cannot be configured, submitted, or completed
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: GloabTopN <input path> <output dir>");
        System.exit(2);
        return;
    }

    Configuration configuration = new Configuration();
    Job job = Job.getInstance(configuration);
    job.setJarByClass(GloabTopN.class);
    job.setJobName("全局topn");

    job.setMapperClass(GloabTopNMap.class);
    job.setReducerClass(GloabTopNReduce.class);

    // Map output is (playCount, songNames); final output is (songNames, playCount).
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    // Remove any previous output so the job does not fail on an existing directory.
    outputDir.getFileSystem(configuration).delete(outputDir, true);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputDir);

    // A single reducer is required: the final top-N is computed in that one task.
    job.setNumReduceTasks(1);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
// Source article title: "Global data TopN".
// (Web-page footer: latest recommended article published 2022-12-25 22:33:58.)