- 倒排索引是文档检索系统中最常用的数据结构,被广泛的应用于全文搜索引擎。它主要用来存储某个单词(或词组),在一个文档或一组文档中的存储位置的映射,即提供了一种根据内容来查找文档的方式,由于不是根据文档来确定文档所包含的内容,而是进行了相反的操作,因而被称为倒排索引。
-
这里有一个前提:只有当一个 mapper 处理的内容都来自同一个文件时,combiner 的处理结果才是正确的。实际上这个前提是成立的:InputFormat 决定 InputSplit 的划分,每个 InputSplit 会分配给一个单独的 Mapper,因此 InputFormat 也决定了 map task 的数量;而 FileInputFormat(包括默认的 TextInputFormat)在划分切片时不会跨越文件边界,所以每个 InputSplit 只来自一个文件,combiner 的做法在这里是正确的。map task 的具体数量等于 split 的数量,可以在作业提交日志或 Web UI 中查看。
- 假设在inversed.files中有file1,file2,file3文件,其内容分别如下:
- file1:
- dog cat
- dog rabbit
- tiger mice
- goose chicken
- rabbit fox
- file2:
- tiger donkey
- lion fish
- duck wolf
- dog bird
- cat bear
- file3:
- snake pig
- lion
- cat
- elephant
- 则进行倒排索引之后,其结果为:
bear file2:1 ,
bird file2:1 ,
cat file2:1 ,file1:1 ,file3:1 ,
chicken file1:1 ,
dog file1:2 ,file2:1 ,duck file2:1 ,donkey file2:1 ,elephant file3:1 ,fish file2:1 ,fox file1:1 ,goose file1:1 ,lion file2:1 ,file3:1 ,mice file1:1 ,pig file3:1 ,rabbit file1:2 ,snake file3:1 ,tiger file2:1 ,file1:1 ,wolf file2:1 ,- import java.io.IOException;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.FileSplit;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class InversedIndex {
- /**
- * 将输入文件拆分,
- * 将关键字和关键字所在的文件名作为map的key输出,
- * 该组合的频率作为value输出
- * */
- public static class InversedIndexMapper extends Mapper<Object, Text, Text, Text> {
- private Text outKey = new Text();
- private Text outVal = new Text();
- @Override
- public void map (Object key,Text value,Context context) {
- StringTokenizer tokens = new StringTokenizer(value.toString());
- FileSplit split = (FileSplit) context.getInputSplit();
- while(tokens.hasMoreTokens()) {
- String token = tokens.nextToken();
- try {
- outKey.set(token + ":" + split.getPath());
- outVal.set("1");
- context.write(outKey, outVal);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /**
- * map的输出进入到combiner阶段,此时来自同一个文件的相同关键字进行一次reduce处理,
- * 将输入的key拆分成关键字和文件名,然后关键字作为输出key,
- * 将文件名与词频拼接,作为输出value,
- * 这样就形成了一个关键字,在某一文件中出现的频率的 key--value 对
- * */
- public static class InversedIndexCombiner extends Reducer<Text, Text, Text, Text> {
- private Text outKey = new Text();
- private Text outVal = new Text();
- @Override
- public void reduce(Text key,Iterable<Text> values,Context context) {
- String[] keys = key.toString().split(":");
- int sum = 0;
- for(Text val : values) {
- sum += Integer.parseInt(val.toString());
- }
- try {
- outKey.set(keys[0]);
- int index = keys[keys.length-1].lastIndexOf('/');
- outVal.set(keys[keys.length-1].substring(index+1) + ":" + sum);
- context.write(outKey, outVal);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 将combiner后的key value对进行reduce,
- * 由于combiner之后,一个关键字可能对应了多个value,故需要将这些value进行合并输出
- * */
- public static class InversedIndexReducer extends Reducer<Text, Text, Text, Text> {
- @Override
- public void reduce (Text key,Iterable<Text> values,Context context) {
- StringBuffer sb = new StringBuffer();
- for(Text text : values) {
- sb.append(text.toString() + " ,");
- }
- try {
- context.write(key, new Text(sb.toString()));
- } catch (IOException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
- Job job = new Job(conf,"index inversed");
- job.setJarByClass(InversedIndex.class);
- job.setMapperClass(InversedIndexMapper.class);
- job.setCombinerClass(InversedIndexCombiner.class);
- job.setReducerClass(InversedIndexReducer.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(Text.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path("inversed.files"));
- FileOutputFormat.setOutputPath(job, new Path("inversed.result"));
- System.exit(job.waitForCompletion(true)?0:1);
- }
- }