====
案例:倒排索引建立
需求分析
需求:有大量的文本(文档、网页),需要建立搜索索引
思路分析:
首先将文档的内容全部读取出来,把每个单词加上文档的名字(单词-文档名)作为key,value为1,组织成如下形式的数据
map端数据输出
hello-a.txt 1
hello-a.txt 1
hello-a.txt 1
reduce端数据输出
hello-a.txt 3
k1: LongWritable; v1: Text
k2: Text; v2: IntWritable
k3: Text; v3: IntWritable
文件c.txt里的内容:
hello jerry
hello tom
k2 v2
hello-c.txt 1
jerry-c.txt 1
tom-c.txt 1
hello-c.txt [1,1] ==> hello-c.txt 2
代码实现
package cn.itcast.demo2;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Map side of the inverted-index job.
 *
 * <p>For every word found on an input line, emits the pair
 * {@code ("<word>-<fileName>", 1)}, where {@code fileName} is the name of the
 * file the current input split belongs to.
 */
public class IndexMapper extends Mapper<LongWritable,Text,Text,IntWritable> {

    /** Constant count emitted for every word occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key instance reused across records to avoid one allocation per word. */
    private final Text outKey = new Text();

    /**
     * Splits one line into words and writes one {@code ("<word>-<fileName>", 1)} pair per word.
     *
     * @param key     input key of the record; not used by this mapper
     * @param value   one line of input text
     * @param context task context, used to look up the input split and to emit output
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The file name is taken from the input split; the cast requires a file-based input format.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String name = fileSplit.getPath().getName();

        // Split on runs of whitespace so repeated spaces or tabs do not create extra tokens.
        for (String word : value.toString().split("\\s+")) {
            // A blank line or leading whitespace yields an empty token; emitting it
            // would produce a meaningless "-<fileName>" key, so skip it.
            if (word.isEmpty()) {
                continue;
            }
            outKey.set(word + "-" + name);
            context.write(outKey, ONE);
        }
    }
}
package cn.itcast.demo2;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.awt.event.KeyEvent;
import java.io.IOException;
/**
 * Reduce side of the inverted-index job: adds up all counts received for one
 * key and writes the key together with that total.
 */
public class IndexReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

    /**
     * Sums the values grouped under {@code key} and emits {@code (key, sum)}.
     *
     * @param key     the grouped key
     * @param values  all counts emitted for this key
     * @param context task context used to emit the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int occurrences = 0;
        for (IntWritable partial : values) {
            occurrences = occurrences + partial.get();
        }

        // Emit the accumulated total for this key.
        IntWritable total = new IntWritable(occurrences);
        context.write(key, total);
    }
}
//main方法省略,可以参照下面这个main来写
public class IndexCreate extends Configured implements Tool {
/**
 * Program entry point: runs this tool through {@code ToolRunner} and exits
 * with the status code returned by {@link #run(String[])}.
 *
 * @param args command-line arguments, handed to {@code ToolRunner.run}
 * @throws Exception if the tool fails to run
 */
public static void main(String[] args) throws Exception {
    // Propagate the job result to the shell; previously the status was dropped
    // and the JVM exited with 0 even when the job failed.
    int exitCode = ToolRunner.run(new Configuration(), new IndexCreate(), args);
    System.exit(exitCode);
}
/**
 * Configures the inverted-index job, submits it and waits for it to finish.
 *
 * @param args optional paths: {@code args[0]} is the input directory and
 *             {@code args[1]} the output directory; when fewer than two
 *             arguments are given the built-in default paths are used
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if the job cannot be configured or submitted
 */
@Override
public int run(String[] args) throws Exception {
    // Command-line paths override the defaults, so the job is not tied to one machine.
    boolean pathsGiven = args != null && args.length >= 2;
    String inputDir = pathsGiven
            ? args[0]
            : "file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\input";
    String outputDir = pathsGiven
            ? args[1]
            : "file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\outindex";

    Job job = Job.getInstance(super.getConf(), IndexCreate.class.getSimpleName());
    // Tells Hadoop which jar holds the job classes when the job is submitted to a cluster.
    job.setJarByClass(IndexCreate.class);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(inputDir));

    // Use the mapper and reducer defined for this example (IndexMapper / IndexReducer).
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
|
====