The raw input dataset is as follows:
All of us have read thrilling stories in which the hero had only a limited and specified time to live. Sometimes it was as long as a year, sometimes as short as 24 hours. But always we were interested in discovering just how the doomed hero chose to spend his last days or his last hours. I speak, of course, of free men who have a choice, not condemned criminals whose sphere of activities is strictly delimited.
After running, the word-count result looks like this:
of 21, a 20, us 15, and so on.
Two MapReduce jobs are used in total: the first counts how many times each word occurs, and the second sorts the counts and extracts the top K.
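For reference, the first job writes one word per line followed by a tab and its count (the default TextOutputFormat layout), and the second job re-parses those lines, so its mapper only has to tell the numeric token apart from the word. Using the counts quoted above, a few lines of that intermediate file would look roughly like:

a	20
of	21
us	15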
First, a custom class MyInt is defined: a Comparable wrapper around Integer that is later used as the key of the TreeMap holding the top-K candidates in the second job.
package topk;

// A comparable Integer wrapper; used as the key of the TreeMap
// that keeps the top-K entries in the second job.
public class MyInt implements Comparable<MyInt> {

    private Integer value;

    public MyInt(Integer value) {
        this.value = value;
    }

    public Integer getValue() {
        return value;
    }

    public void setValue(Integer value) {
        this.value = value;
    }

    @Override
    public int compareTo(MyInt o) {
        // Natural (ascending) order; the TreeMap reverses it with its own Comparator.
        return value.compareTo(o.getValue());
    }
}
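A minimal sketch (a hypothetical standalone demo, not part of the jobs) of how MyInt behaves inside a TreeMap with a reversed Comparator, which is exactly how the second job keeps its candidates ordered from the largest count to the smallest:

package topk;

import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;

public class MyIntDemo {
    public static void main(String[] args) {
        // Reverse MyInt's natural order so the largest count comes first.
        TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1);
            }
        });
        tm.put(new MyInt(21), "of");
        tm.put(new MyInt(20), "a");
        tm.put(new MyInt(15), "us");
        for (Map.Entry<MyInt, String> e : tm.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey().getValue());
        }
        // Prints of 21, then a 20, then us 15 (descending by count).
    }
}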
Part 1: the first MapReduce job (word count)
package topk;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class top {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                // Strip a few punctuation characters before emitting <word, 1>.
                String word = st.nextToken().replaceAll("/", "").replace("'", "").replace(".", "");
                context.write(new Text(word), one);
            }
        }
    }

    public static class Reducer extends org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the 1s emitted by the mapper to get the word's total count.
            int count = 0;
            for (IntWritable val : values) {
                count += val.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static boolean run(String in, String out)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(top.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return job.waitForCompletion(true);
    }
}
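Because reduce() simply sums IntWritable values, the same class could also be registered as a combiner to pre-aggregate counts on the map side and cut down the shuffled data. This is an optional tweak, not part of the original code; the only change would be one extra line in run():

        job.setMapperClass(Map.class);
        // Optional: pre-aggregate <word, 1> pairs on the map side.
        // Safe only because reduce() sums the values rather than counting them.
        job.setCombinerClass(Reducer.class);
        job.setReducerClass(Reducer.class);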
Part 2: the second MapReduce job (sort by count and keep the top K)
package topk;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class topk {

    public static class Map extends Mapper<Object, Text, IntWritable, Text> {
        IntWritable outKey = new IntWritable();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line from the first job looks like "word<TAB>count".
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                String element = st.nextToken();
                if (Pattern.matches("\\d+", element)) {
                    // The purely numeric token is the word's count
                    // (this assumes the words themselves are never pure digits).
                    outKey.set(Integer.parseInt(element));
                } else {
                    outValue.set(element);
                }
            }
            // Emit <count, word> so the shuffle sorts records by count.
            context.write(outKey, outValue);
        }
    }

    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<IntWritable, Text, Text, IntWritable> {
        private static MultipleOutputs<Text, IntWritable> mos = null;
        // K: how many of the most frequent words to keep.
        private static final int k = 10;
        // TreeMap ordered by descending count; note that two words with the same
        // count share a key, so the later one overwrites the earlier one.
        private static TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1);
            }
        });

        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                // Write the full sorted list to the regular output...
                context.write(text, key);
                // ...and keep at most k candidates for the top-K output.
                tm.put(new MyInt(key.get()), text.toString());
                if (tm.size() > k) {
                    // Evict the entry with the smallest count.
                    tm.remove(tm.lastKey());
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Write the surviving top-K entries to the named output.
            String path = context.getConfiguration().get("topKout");
            mos = new MultipleOutputs<Text, IntWritable>(context);
            Set<Entry<MyInt, String>> set = tm.entrySet();
            for (Entry<MyInt, String> entry : set) {
                mos.write("topKMOS", new Text(entry.getValue()),
                        new IntWritable(entry.getKey().getValue()), path);
            }
            mos.close();
        }
    }

    public static void run(String in, String out, String topKout)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Directory where the top-K words will be written.
        conf.set("topKout", topKout);
        Job job = Job.getInstance(conf, "Sort");
        job.setJarByClass(topk.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Register the MultipleOutputs named output used in cleanup().
        MultipleOutputs.addNamedOutput(job, "topKMOS", TextOutputFormat.class, Text.class, IntWritable.class);
        // Input and output directories.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.waitForCompletion(true);
    }
}
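One caveat: the top-K list lives in a single in-memory TreeMap, so the result is only globally correct when the sort job runs with exactly one reduce task (which is the default here). If the job were ever configured with more reducers, something along these lines would be needed in run() (an assumption about deployment, not part of the original code):

        // Force a single reducer so one TreeMap sees every <count, word> pair.
        job.setNumReduceTasks(1);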
Part 3: a main class that chains the two MapReduce jobs
package topk;

import java.io.IOException;

public class topkmain {
    public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException {
        // Text whose words will be counted and sorted.
        String in = "C:/danci.txt";
        // Output of the word-count job.
        String wordCount = "C:/outaa/wordCount";
        // Output of the sort job (full list, sorted by count).
        String sort = "C:/outaa/sort";
        // Output directory for the top-K named output.
        String topK = "C:/outaa/shuchudejieguo";
        // Start the sort job only if the word-count job finished successfully.
        if (top.run(in, wordCount)) {
            topk.run(wordCount, sort, topK);
        }
    }
}
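The paths above are hard-coded local Windows paths, which is fine for testing. As a minimal sketch (a hypothetical alternative driver with an assumed argument order, not part of the original code), the same chain could take its four paths from the command line instead, which also makes it usable with HDFS paths:

package topk;

import java.io.IOException;

public class topkmainArgs {
    public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
        if (args.length != 4) {
            System.err.println("Usage: topkmainArgs <in> <wordCountOut> <sortOut> <topKOut>");
            System.exit(2);
        }
        // args[0]: input text, args[1]: word-count output,
        // args[2]: sorted output, args[3]: top-K output directory.
        if (top.run(args[0], args[1])) {
            topk.run(args[1], args[2], args[3]);
        }
    }
}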