import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.InverseMapper;
import org.apache.hadoop.mapred.lib.LongSumReducer;
import org.apache.hadoop.mapred.lib.RegexMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
// Extracts tokens matching a regular expression from the input and counts their frequency.
public class Grep extends Configured implements Tool {

    /** No state to initialize; instances are normally created in {@link #main}. */
    public Grep() {
    }

    /**
     * Entry point. Delegates to {@link ToolRunner} so the generic Hadoop
     * options (-conf, -D, -fs, -jt, ...) are parsed before {@link #run}.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Grep(), args);
        System.exit(res);
    }

    /**
     * Runs two chained MapReduce jobs:
     * <ol>
     *   <li>"grep-search": {@link RegexMapper} emits (match, 1) pairs and
     *       {@link LongSumReducer} (as combiner and reducer) totals them into a
     *       SequenceFile under a random temporary directory;</li>
     *   <li>"grep-sort": {@link InverseMapper} swaps to (count, match) and a
     *       single reducer writes the result to {@code args[1]}, sorted by
     *       decreasing frequency.</li>
     * </ol>
     *
     * @param args indir, outdir, regex, and an optional regex capture-group index
     * @return 0 on success, -1 on bad usage
     * @throws Exception if either MapReduce job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println("Grep <indir> <outdir> <regex> [<group>]");
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        // Random temporary path for the intermediate (word, count) SequenceFile.
        Path tempdir = new Path("grep-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf grepJob = new JobConf(getConf(), Grep.class);
        try {
            grepJob.setJobName("grep-search");
            FileInputFormat.setInputPaths(grepJob, args[0]);
            grepJob.setMapperClass(RegexMapper.class);
            // RegexMapper reads its pattern (and optional group) from these keys.
            grepJob.set("mapred.mapper.regex", args[2]);
            if (args.length == 4) {
                grepJob.set("mapred.mapper.regex.group", args[3]);
            }
            // Combiner and reducer both sum the per-match counts.
            grepJob.setCombinerClass(LongSumReducer.class);
            grepJob.setReducerClass(LongSumReducer.class);
            // Intermediate output goes to the temp dir as a SequenceFile.
            FileOutputFormat.setOutputPath(grepJob, tempdir);
            grepJob.setOutputFormat(SequenceFileOutputFormat.class);
            grepJob.setOutputKeyClass(Text.class);
            grepJob.setOutputValueClass(LongWritable.class);
            JobClient.runJob(grepJob);

            JobConf sortJob = new JobConf(getConf(), Grep.class);
            sortJob.setJobName("grep-sort");
            FileInputFormat.setInputPaths(sortJob, tempdir);
            sortJob.setInputFormat(SequenceFileInputFormat.class);
            // Swap (word, count) -> (count, word) so the shuffle sorts by count.
            sortJob.setMapperClass(InverseMapper.class);
            // A single reducer yields one output file.
            sortJob.setNumReduceTasks(1);
            FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
            // Sort keys (the counts) in decreasing order: highest frequency first.
            sortJob.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class);
            JobClient.runJob(sortJob);
        } finally {
            // Always clean up the intermediate directory, even if a job fails.
            FileSystem.get(grepJob).delete(tempdir, true);
        }
        return 0;
    }
}
// Hadoop study notes: grep
// (blog post; latest recommended article published 2022-08-29 19:01:08)