题目:
输入1、2为两个input文件,输入3为stopwords文件,输入4为输出目录;对input文件按照“(space)\t\n\r\f”进行分词,输出在两个input中均出现的单词及其在两文件中较小的出现次数;并排除掉stopwords中出现的单词,结果按词频降序排序,只展示top20;
输出结果如下:
287 I 44 It 27 But 23 The 17 There 17 He 17 And 15 will 14 good 14 If 12 it. 11 great 10 things 9 This 9 well 9 room 9 feel 8 long 8 You 8 thing
代码总览
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TopkCommonWords {
public static class MapOne extends Mapper<Object, Text, Text, IntWritable> {
    // Stopwords loaded once per mapper in setup(); tokens found here are skipped in map().
    private Set<String> stopwords;
    private final static IntWritable one = new IntWritable(1);

    /**
     * Loads the stopword file (HDFS path passed through the "stopwords" conf key)
     * into an in-memory set. Lines are tokenized with StringTokenizer's default
     * delimiters (space, \t, \n, \r, \f), matching the tokenization used in map().
     *
     * Fixes vs. original: the stream/reader was never closed (resource leak);
     * now closed via try-with-resources. HashSet replaces TreeSet because only
     * membership tests are needed, no ordering.
     */
    @Override
    public void setup(Context context) throws IOException {
        stopwords = new HashSet<>();
        Configuration conf = context.getConfiguration();
        String stopwordsPath = conf.getStrings("stopwords")[0];
        FileSystem fs = FileSystem.get(URI.create(stopwordsPath), conf);
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path(stopwordsPath)), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                StringTokenizer itr = new StringTokenizer(line);
                while (itr.hasMoreTokens()) {
                    stopwords.add(itr.nextToken());
                }
            }
        }
    }

    /**
     * Emits ("<inputFileName>\t<word>", 1) for every non-stopword token, so the
     * reducer can total occurrences per (file, word) pair.
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String fileName = inputSplit.getPath().getName();
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String word = itr.nextToken();
            if (!stopwords.contains(word)) {
                context.write(new Text(fileName + "\t" + word), one);
            }
        }
    }
}
public static class ReduceOne extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output writable to avoid one allocation per key.
    IntWritable result = new IntWritable();

    /** Sums the 1-counts emitted by MapOne for each "file\tword" key and writes the total. */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        Iterator<IntWritable> it = values.iterator();
        while (it.hasNext()) {
            total += it.next().get();
        }
        result.set(total);
        context.write(key, result);
    }
}
public static class MapTwo extends Mapper<LongWritable, Text, IntWritable, Text> {
    // Per-word counts of the first input file seen by this mapper.
    // NOTE(review): this assumes a single mapper sees job1's output with all
    // lines of one file's counts before the other's (true here because job1's
    // keys start with the file name, so its sorted output groups by file).
    private final Map<String, Integer> compSet = new HashMap<>();
    private String curKeyFile = null;

    /**
     * Reads job1 output lines of the form "fileName\tword\tcount".
     * Lines from the first file are buffered into compSet; for lines from the
     * second file, emits (min(count1, count2), word) for words present in both.
     *
     * Fixes vs. original: raw HashMap -> generic; malformed/short lines are
     * skipped instead of throwing ArrayIndexOutOfBoundsException; the double
     * containsKey/get lookup collapsed into one get; explicit min via Math.min.
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] data = value.toString().split("\t");
        if (data.length < 3) {
            return; // skip blank or malformed lines defensively
        }
        String keyFile = data[0];
        String keyword = data[1];
        int num = Integer.parseInt(data[2]);
        if (compSet.isEmpty()) {
            curKeyFile = keyFile; // remember which file we are buffering
        }
        if (keyFile.equals(curKeyFile)) {
            compSet.put(keyword, num);
        } else {
            Integer firstCount = compSet.get(keyword);
            if (firstCount != null) {
                // Common word: emit the smaller of the two per-file counts.
                context.write(new IntWritable(Math.min(firstCount, num)), new Text(keyword));
            }
        }
    }
}
public static class ReduceTwo extends Reducer<IntWritable, Text, IntWritable, Text> {
    private static final int MAX_NUM = 20; // keep only the top-20 entries

    // Collected (count, word) pairs in arrival order. Keys arrive in descending
    // order because job2 installs the inverting Sort comparator, so the first
    // MAX_NUM entries are exactly the top-20.
    // Fixes vs. original: was a *static* mutable list (shared across reducer
    // instances in one JVM) holding raw single-entry HashMaps; now an instance
    // field of typed Map.Entry values, and we stop adding once full instead of
    // add-then-remove.
    private final List<Map.Entry<Integer, String>> top = new ArrayList<>();

    /** Buffers up to MAX_NUM (count, word) pairs; input keys are pre-sorted descending. */
    @Override
    protected void reduce(IntWritable key, Iterable<Text> values, Context context) {
        for (Text text : values) {
            if (top.size() >= MAX_NUM) {
                return; // already have the top 20; everything after is smaller
            }
            top.add(new AbstractMap.SimpleImmutableEntry<>(key.get(), text.toString()));
        }
    }

    /** Emits the buffered top-20 list after all keys have been reduced. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<Integer, String> entry : top) {
            context.write(new IntWritable(entry.getKey()), new Text(entry.getValue()));
        }
    }
}
public static class Sort extends IntWritable.Comparator {
    /** Descending order: negate the default ascending IntWritable comparison. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        int ascending = super.compare(a, b);
        return -ascending;
    }

    /** Byte-level overload used during the shuffle; inverted the same way. */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int ascending = super.compare(b1, s1, l1, b2, s2, l2);
        return -ascending;
    }
}
/**
 * Entry point. args[0], args[1]: the two input files; args[2]: stopwords file;
 * args[3]: output directory. Job1 counts non-stopword words per input file into
 * a temp dir; job2 keeps words common to both files with their smaller count,
 * sorted descending, top 20 written by ReduceTwo.
 *
 * Fixes vs. original: tmpout used substring(0, length() - 2), which chopped the
 * last character of the path; and when the output path had no trailing "/",
 * tmpout stayed "" and job1's output path was invalid. Job name typo
 * "world count" corrected to "word count".
 */
public static void main(String[] args) throws Exception {
    String out = args[3];
    // Intermediate directory for job1's output, derived from the final output path.
    String tmpout = (out.endsWith("/") ? out.substring(0, out.length() - 1) : out) + "tmp/";
    Configuration conf1 = new Configuration(true);
    conf1.setStrings("stopwords", args[2]);
    // job1: per-file word count, excluding stopwords
    System.out.println("job1");
    Job job1 = Job.getInstance(conf1, "word count");
    job1.setJarByClass(TopkCommonWords.class);
    job1.setMapperClass(MapOne.class);
    job1.setReducerClass(ReduceOne.class);
    job1.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job1, new Path(tmpout));
    if (job1.waitForCompletion(true)) {
        // job2: min-count of common words, descending sort, top 20
        System.out.println("job2");
        Configuration conf2 = new Configuration(true);
        conf2.set("topKout", out);
        Job job2 = Job.getInstance(conf2, "sort");
        job2.setJarByClass(TopkCommonWords.class);
        job2.setMapperClass(MapTwo.class);
        job2.setReducerClass(ReduceTwo.class);
        job2.setInputFormatClass(TextInputFormat.class);
        job2.setSortComparatorClass(Sort.class);
        FileInputFormat.addInputPath(job2, new Path(tmpout));
        job2.setOutputKeyClass(IntWritable.class);
        job2.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job2, new Path(out));
        if (job2.waitForCompletion(true)) {
            // Remove the intermediate directory only after job2 has succeeded.
            FileSystem fs = FileSystem.get(URI.create(tmpout), conf2);
            fs.delete(new Path(tmpout), true);
        }
    }
}
}