package org.apache.hadoop.examples;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
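/* Searches the input files for matches of a regular expression and writes
the results to the output files. The search relies on the RegexMapper,
LongSumReducer and InverseMapper utility classes that ship with Hadoop.
This source runs two jobs: one that searches and one that sorts.
Extracts matching regexs from input files and counts them. */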
public class Grep extends Configured implements Tool {
private Grep() {} // 单例模式singleton
public int run(String[] args) throws Exception {
if (args.length < 3) {
System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
Path tempDir =
new Path("grep-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf grepJob = new JobConf(getConf(), Grep.class);
try {
grepJob.setJobName("grep-search");
FileInputFormat.setInputPaths(grepJob, args[0]);
grepJob.setMapperClass(RegexMapper.class);//设置系统自带的mapper类来查找
grepJob.set("mapred.mapper.regex", args[2]);
if (args.length == 4)
grepJob.set("mapred.mapper.regex.group", args[3]);
grepJob.setCombinerClass(LongSumReducer.class);//设置系统自带的reducer来做合并
grepJob.setReducerClass(LongSumReducer.class);
//设置系统自带的reducer。
FileOutputFormat.setOutputPath(grepJob, tempDir);
grepJob.setOutputFormat(SequenceFileOutputFormat.class);//设置输出格式是二进制文件
grepJob.setOutputKeyClass(Text.class);//输出的key是Text类型
grepJob.setOutputValueClass(LongWritable.class);//输出的value是long类型
JobClient.runJob(grepJob);
JobConf sortJob = new JobConf(Grep.class);
sortJob.setJobName("grep-sort");
FileInputFormat.setInputPaths(sortJob, tempDir);
sortJob.setInputFormat(SequenceFileInputFormat.class);//设置输入的文件格式二进制文件
sortJob.setMapperClass(InverseMapper.class);//设置自带的排序mapper
sortJob.setNumReduceTasks(1); // write a single file
FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
sortJob.setOutputKeyComparatorClass // 输出结果是降序排列sort by decreasing freq
(LongWritable.DecreasingComparator.class);
JobClient.runJob(sortJob);
}
finally {
FileSystem.get(grepJob).delete(tempDir, true);
}
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new Grep(), args);
System.exit(res);
}
}
// Annotated Grep source code
// (web page residue: "latest recommended article, published 2021-05-12 13:40:43")