1 待统计的文本数据
2 源码
package InverseIndex
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.IntWritable
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.Mapper
import org.apache.hadoop.mapreduce.Reducer
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import java.io.IOException
public class InverseIndexStepOne {
static class InverseIndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
Text k = new Text()
IntWritable v = new IntWritable(1)
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString()
String[] words = line.split(" ")
FileSplit inputSplit = (FileSplit) context.getInputSplit()
String fileName = inputSplit.getPath().getName()
for (String word : words) {
k.set(word + "--" + fileName)
context.write(k, v)
}
}
}
static class InverseIndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0
for (IntWritable value : values) {
count += value.get()
}
context.write(key, new IntWritable(count))
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration()
Job job = Job.getInstance(conf)
job.setJarByClass(InverseIndexStepOne.class)
job.setOutputKeyClass(Text.class)
job.setOutputValueClass(IntWritable.class)
FileInputFormat.setInputPaths(job,new Path("h:/inverse/input"))
FileOutputFormat.setOutputPath(job,new Path("h:/inverse/output_step_one"))
job.setMapperClass(InverseIndexStepOneMapper.class)
job.setReducerClass(InverseIndexStepOneReducer.class)
job.waitForCompletion(true)
}
}
2.1 本地运行结果
3 调整输出
3.1 源码
package InverseIndex
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.IntWritable
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.Mapper
import org.apache.hadoop.mapreduce.Reducer
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import java.io.IOException
public class InverseIndexStepTwo {
static class InverseIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString()
String[] word_file = line.split("--")
context.write(new Text(word_file[0]), new Text(word_file[1]))
}
}
static class InverseIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuffer sb = new StringBuffer()
for (Text value : values) {
sb.append(value.toString()+"\t")
}
context.write(key,new Text(sb.toString()))
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration()
Job job = Job.getInstance(conf)
job.setJarByClass(InverseIndexStepTwo.class)
job.setOutputKeyClass(Text.class)
job.setOutputValueClass(Text.class)
FileInputFormat.setInputPaths(job,new Path("h:/inverse/output_step_one"))
FileOutputFormat.setOutputPath(job,new Path("h:/inverse/output_step_two"))
job.setMapperClass(InverseIndexStepTwoMapper.class)
job.setReducerClass(InverseIndexStepTwoReducer.class)
job.waitForCompletion(true)
}
}
3.2 本地运行结果