需求:
1、在不同的文件中有着各种单词,每行单词之间以空格间隔
2、统计所有文件中每个单词在各文件中出现的次数,以每行为
单词(空格)文件1名-->单词出现的次数(空格)文件2名-->单词出现的次数(空格)文件3名-->单词出现的次数 的格式产生输出
思路:
1、先以 单词--文件名(空格)单词出现次数 的格式输出。
2、然后再将相同的单词进行整合,按要求格式输出。
(本地)
测试数据:
a.txt:
tom jerry bpf
good nice bpf
b.txt:
hello calvin
nice job bro
c.txt:
bpf you are right
so cool calvin
第一步代码:
package com.bpf.mr.inverindex;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step 1 of building an inverted index: counts, per (word, file) pair, how
 * often each word occurs, producing lines of the form
 * "word--filename<TAB>count" for step 2 to consume.
 */
public class InverIndexStepone {

    /** Mapper: for each word in a line, emits key "word--filename" with count 1. */
    static class InverIndexSteponeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Text k = new Text();
        private final IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Identify which input file this record came from.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();

            // Split on runs of whitespace: splitting on a single space (" ")
            // would turn consecutive/leading spaces into empty "words" that
            // get counted. Skip any empty token defensively.
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                if (word.isEmpty()) {
                    continue;
                }
                k.set(word + "--" + name);
                context.write(k, v);
            }
        }
    }

    /** Reducer: sums the occurrence counts for each "word--filename" key. */
    static class InverIndexSteponeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    /** Configures and submits the step-1 job against local test directories. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexStepone.class);
        job.setMapperClass(InverIndexSteponeMapper.class);
        job.setReducerClass(InverIndexSteponeReducer.class);
        // Summing is associative and commutative, so the reducer can also run
        // as a combiner to shrink the map-side shuffle output.
        job.setCombinerClass(InverIndexSteponeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\测试数据\\输入"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\测试数据\\输出"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
输出结果:
第二步代码:
package com.bpf.mr.inverindex;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step 2 of building an inverted index: regroups step-1 output
 * ("word--filename<TAB>count") by word, producing one line per word:
 * "word<TAB>file1-->n1 file2-->n2 ...".
 */
public class InverIndexSteptwo {

    /** Mapper: re-keys "word--filename<TAB>count" as word -> "filename-->count". */
    static class InverIndexSteptwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordAndRest = line.split("--");
            // Guard against malformed lines that lack the "--" separator;
            // the original indexed wordAndRest[1] unchecked and would throw
            // ArrayIndexOutOfBoundsException, failing the whole task.
            if (wordAndRest.length < 2) {
                return;
            }
            String[] fileAndCount = wordAndRest[1].split("\t");
            if (fileAndCount.length < 2) {
                return;
            }
            k.set(wordAndRest[0]);
            // No trailing space here; the reducer inserts separators between
            // entries, so output lines no longer end with a dangling space.
            v.set(fileAndCount[0] + "-->" + fileAndCount[1]);
            context.write(k, v);
        }
    }

    /** Reducer: joins all "filename-->count" entries for a word with spaces. */
    static class InverIndexSteptwoReducer extends Reducer<Text, Text, Text, Text> {

        private final Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder avoids the O(n^2) cost of repeated String
            // concatenation in a loop.
            StringBuilder joined = new StringBuilder();
            for (Text value : values) {
                if (joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(value.toString());
            }
            v.set(joined.toString());
            context.write(key, v);
        }
    }

    /** Configures and submits the step-2 job, reading step 1's output. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexSteptwo.class);
        job.setMapperClass(InverIndexSteptwoMapper.class);
        job.setReducerClass(InverIndexSteptwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\测试数据\\输出"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\测试数据\\再输出"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
输出结果: