Requirement: given several files, compute for each word which files it appears in and how many times it occurs in each file (an inverted index with per-file word counts).
File test1:
zsy test
zsy tom
zsy test
File test2:
tom test
zsy tom
zsy cat
File test3:
cat test
cat tom
test cat
Step 1:
Combine each word with the name of the file it came from to form the key; the value is the number of occurrences.
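For example, the first line of test1, "zsy test", makes the step-one mapper emit the pairs below; the reducer then sums the 1s for each key:
zsy--test1	1
test--test1	1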
package com.zsy.mr.inverindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InverIndexStepOne {

    static class InverIndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable val = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            // Get the name of the file this split came from
            FileSplit split = (FileSplit) context.getInputSplit();
            String name = split.getPath().getName();
            // Emit "word--filename" as the key and 1 as the value
            for (String word : words) {
                k.set(word + "--" + name);
                context.write(k, val);
            }
        }
    }

    static class InverIndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Sum up the occurrences of each "word--filename" key
            int count = 0;
            for (IntWritable intWritable : values) {
                count += intWritable.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        /*conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "hadoop01");*/
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexStepOne.class);
        // Specify the mapper and reducer classes this job uses
        job.setMapperClass(InverIndexStepOneMapper.class);
        job.setReducerClass(InverIndexStepOneReducer.class);
        // Specify the mapper's output k/v types; if they match the final output types,
        // setting the final output types alone is enough
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(IntWritable.class);
        // Specify the final output k/v types (the reducer's output types)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Specify the directory holding the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the job's output directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job's configuration, plus the jar containing all its classes, to YARN.
        // job.submit() returns nothing, so waitForCompletion(true) is preferred.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
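To try step one, assuming the classes are packaged into a jar named inverindex.jar and the test files have been uploaded to HDFS (the jar name and paths here are only illustrative), the job can be submitted along these lines:
hadoop jar inverindex.jar com.zsy.mr.inverindex.InverIndexStepOne /inverindex/input /inverindex/output1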
Result of step one (key is word--filename, value is the count):
cat--test2	1
cat--test3	3
test--test1	2
test--test2	1
test--test3	2
tom--test1	1
tom--test2	2
tom--test3	1
zsy--test1	3
zsy--test2	2
Step 2:
Run a second MR job over the step-one output. This time the key can be obtained by splitting the line on "--", so that the word itself becomes the key; see the code below.
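For example, the step-one output line "zsy--test1	3" (TextOutputFormat joins key and value with a tab) splits on "--" into "zsy" and "test1	3"; "zsy" becomes the new key, so the step-two reducer receives all of the per-file counts for "zsy" together.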
package com.zsy.mr.inverindex;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InverIndexStepTwo {

    static class InverIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Each step-one output line looks like "word--filename<TAB>count";
            // splitting on "--" yields the word and the "filename<TAB>count" remainder
            String[] words = value.toString().split("--");
            k.set(words[0]);
            context.write(k, new Text(words[1]));
        }
    }

    static class InverIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Collect every "filename<TAB>count" entry for this word and join them into one line
            List<String> resultList = new ArrayList<String>();
            for (Text text : values) {
                resultList.add(text.toString());
            }
            String result = StringUtils.join(resultList, ";\t");
            context.write(key, new Text(result));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        /*conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "hadoop01");*/
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexStepTwo.class);
        // Specify the mapper and reducer classes this job uses
        job.setMapperClass(InverIndexStepTwoMapper.class);
        job.setReducerClass(InverIndexStepTwoReducer.class);
        // The mapper's output k/v types match the final output types,
        // so setting the final output types alone is enough
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(Text.class);
        // Specify the final output k/v types (the reducer's output types)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Specify the directory holding the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the job's output directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job's configuration, plus the jar containing all its classes, to YARN.
        // job.submit() returns nothing, so waitForCompletion(true) is preferred.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
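Step two then takes step one's output directory as its input (again, the jar name and paths are only illustrative):
hadoop jar inverindex.jar com.zsy.mr.inverindex.InverIndexStepTwo /inverindex/output1 /inverindex/output2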
Result (the order of the file entries within each line may vary, since the values arriving at a reduce call are not sorted):
cat	test2	1;	test3	3
test	test1	2;	test2	1;	test3	2
tom	test1	1;	test2	2;	test3	1
zsy	test1	3;	test2	2
Having written these few Hadoop examples, I feel the hardest part is the design of the key, which is really where the algorithm lives; the rest of the logic is simple. Key design is the genuinely difficult part of Hadoop.