Hadoop reading-notes series: http://blog.csdn.net/caicongyang/article/category/2166855 (the series will be revised incrementally; notes on data file formats and related comments will be added)
1. Problem statement:
Find the largest 100 values in a given file. The data file format is as follows:
533
16565
17800
2929
11374
9826
6852
20679
18224
21222
8227
5336
912
29525
3382
2100
10673
12284
31634
27405
18015
...
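To reproduce the experiment, the input is simply a text file with one integer per line. Below is a minimal sketch for generating such a test file; the class name, output file name, and value range are assumptions, not part of the original post:

package suanfa;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;

// Hypothetical helper, not part of the original job: writes random
// non-negative integers, one per line, to a local test file.
public class GenerateTopKInput {
    public static void main(String[] args) throws IOException {
        Random random = new Random();
        FileWriter writer = new FileWriter("topk_input.txt"); // assumed file name
        try {
            for (int i = 0; i < 10000000; i++) { // 10 million lines, matching the post's description
                writer.write(random.nextInt(100000000) + "\n"); // assumed value range
            }
        } finally {
            writer.close();
        }
    }
}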
2. The code below relies on the TreeMap class, so let's start with a short demo:
TreeMapDemo.java
package suanfa;

import java.util.Map.Entry;
import java.util.TreeMap;

public class TreeMapDemo {
    public static void main(String[] args) {
        // TreeMap keeps its entries sorted by key in ascending order.
        TreeMap<Long, Long> tree = new TreeMap<Long, Long>();
        tree.put(1333333L, 1333333L);
        tree.put(1222222L, 1222222L);
        tree.put(1555555L, 1555555L);
        tree.put(1444444L, 1444444L);
        for (Entry<Long, Long> entry : tree.entrySet()) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
        System.out.println(tree.firstEntry().getValue()); // value under the smallest key
        System.out.println(tree.lastEntry().getValue());  // value under the largest key
        System.out.println(tree.navigableKeySet());       // keys in ascending order
        System.out.println(tree.descendingKeySet());      // keys in descending order
    }
}
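For reference, the demo prints the following (entries iterate in ascending key order):

1222222:1222222
1333333:1333333
1444444:1444444
1555555:1555555
1222222
1555555
[1222222, 1333333, 1444444, 1555555]
[1555555, 1444444, 1333333, 1222222]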
3. MapReduce code:
TopKAapp.java
package suanfa;

import java.io.IOException;
import java.net.URI;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * <p>
 * Title: TopKAapp.java Package suanfa
 * </p>
 * <p>
 * Description: find the largest 100 numbers among 10 million numbers
 * </p>
 *
 * @author Tom.Cai
 * @created 2014-12-10 10:56:44 PM
 * @version V1.0
 */
public class TopKAapp {
    private static final String INPUT_PATH = "hdfs://192.168.80.100:9000/topk_input";
    private static final String OUT_PATH = "hdfs://192.168.80.100:9000/topk_out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists, so the job can be rerun.
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        final Job job = new Job(conf, TopKAapp.class.getSimpleName());
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setMapperClass(MyMapper.class);
        job.setPartitionerClass(HashPartitioner.class);
        // A single reducer sees every mapper's local top 100 and picks the global top 100.
        job.setNumReduceTasks(1);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.setOutputFormatClass(TextOutputFormat.class);
        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, NullWritable, LongWritable> {
        public static final int K = 100;
        private TreeMap<Long, Long> tree = new TreeMap<Long, Long>();

        @Override
        public void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
            // Keep only the K largest values seen so far: whenever the tree
            // grows past K entries, evict the smallest key.
            long temp = Long.parseLong(text.toString());
            tree.put(temp, temp);
            if (tree.size() > K)
                tree.remove(tree.firstKey());
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit this mapper's local top K once its input split is consumed.
            for (Long text : tree.values()) {
                context.write(NullWritable.get(), new LongWritable(text));
            }
        }
    }

    static class MyReducer extends Reducer<NullWritable, LongWritable, NullWritable, LongWritable> {
        public static final int K = 100;
        private TreeMap<Long, Long> tree = new TreeMap<Long, Long>();

        @Override
        protected void reduce(NullWritable key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Merge the local top-K lists from all mappers, again keeping only K entries.
            for (LongWritable value : values) {
                tree.put(value.get(), value.get());
                if (tree.size() > K)
                    tree.remove(tree.firstKey());
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Write the global top K in descending order.
            for (Long val : tree.descendingKeySet()) {
                context.write(NullWritable.get(), new LongWritable(val));
            }
        }
    }
}
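To run the job, package the class into a jar and submit it with the hadoop client; the jar name below is an assumption:

hadoop fs -put topk_input.txt /topk_input
hadoop jar topk.jar suanfa.TopKAapp
hadoop fs -cat /topk_out/part-r-00000

One trade-off worth noting: because TreeMap keeps a single entry per key, duplicate values collapse into one entry, so the job actually outputs the 100 largest distinct values. If duplicates should be counted separately, store an occurrence count as the map value instead of the value itself.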
Everyone is welcome to discuss and learn together! If you find this useful, save it!
Record and share, and we grow together!
Feel free to check out my other blogs:
My personal blog: http://blog.caicongyang.com
My CSDN blog: http://blog.csdn.net/caicongyang