Inverted Index

Concept

Inverted index: the inverted index is the data structure most commonly used in document retrieval systems and is widely applied in search engines; it is a way of locating documents by their content. Because it inverts the usual document-to-content mapping, it is called an inverted index.
Put simply, given a word, it returns which files that word appears in and how often.
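As a minimal, hypothetical illustration of the idea, independent of Hadoop, the following sketch builds an in-memory inverted index for two made-up documents (the class name, file names, and contents are invented for the example):

import java.util.HashMap;
import java.util.Map;

public class TinyInvertedIndex {
    public static void main(String[] args) {
        // hypothetical documents, keyed by file name
        Map<String, String> docs = new HashMap<String, String>();
        docs.put("doc1.txt", "hadoop mapreduce hadoop");
        docs.put("doc2.txt", "hadoop spark");

        // word -> (document -> frequency)
        Map<String, Map<String, Integer>> index = new HashMap<String, Map<String, Integer>>();
        for (Map.Entry<String, String> doc : docs.entrySet()) {
            for (String word : doc.getValue().split(" ")) {
                Map<String, Integer> postings = index.get(word);
                if (postings == null) {
                    postings = new HashMap<String, Integer>();
                    index.put(word, postings);
                }
                Integer count = postings.get(doc.getKey());
                postings.put(doc.getKey(), count == null ? 1 : count + 1);
            }
        }
        // prints something like: {spark={doc2.txt=1}, hadoop={doc1.txt=2, doc2.txt=1}, mapreduce={doc1.txt=1}}
        System.out.println(index);
    }
}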

Design

Map phase

On the map side, once the documents to be processed have been uploaded to HDFS, the input files are read so that each line yields an <offset, line content> key-value pair, which serves as the map input. From each line the map extracts the information the index needs: the word, the document URI, and the term frequency.
key: the word and the URI
value: the number of occurrences of that word.
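For instance, with the url$$title$$content line format assumed by the sample code below, a hypothetical input line

http://a.com$$hadoop guide$$learn hadoop now

makes the mapper emit <(hadoop, http://a.com), 1>, <(guide, http://a.com), 1>, <(learn, http://a.com), 1>, <(hadoop, http://a.com), 1> and <(now, http://a.com), 1>, one pair per word occurrence.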

Combiner phase

After the map method runs, the combine step sums the values that share the same key, producing the frequency of a word within a single document. To route identical keys to the corresponding reducer, a custom key type is required.
Note
In the combiner step, because the map output types must match the reduce input types, the combiner's input and output types must also be identical.
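Continuing the hypothetical line above, the combiner folds the two <(hadoop, http://a.com), 1> pairs into a single <(hadoop, http://a.com), 2> pair, while the other words keep a count of 1; both its input and output remain <URLWritable, IntWritable>.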

Reduce side

The reducer receives the '[key, value]' data emitted by the combiner. The reduce step only needs to assemble the values that share the same key into the inverted-index file format; everything else is handled by the MapReduce framework.
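With the sample code below, each output record is a (word, url) key and its total count. Since URLWritable.toString() joins the two fields with a comma and the default TextOutputFormat separates key and value with a tab, a result line for the hypothetical input above would look like hadoop,http://a.com followed by a tab and 2.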

Sample code


Combiner

package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class IndexCombiner extends
        Reducer<URLWritable, IntWritable, URLWritable, IntWritable> {
    private IntWritable combinerOutputValue = new IntWritable();

    @Override
    public void reduce(URLWritable key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        // sum the 1s emitted by the mapper for this (word, url) key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        combinerOutputValue.set(sum);
        // the combiner must emit the same <key, value> types it receives,
        // so the original key is written back out unchanged
        context.write(key, combinerOutputValue);
    }
}

Custom type

package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class URLWritable implements WritableComparable<URLWritable> {

    private String key;
    private String url;

    public URLWritable() {
    }

    public String getKey() {
        return key;
    }

    public void setKey(String key) {
        this.key = key;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public  void set(String key, String url) {
        this.key = key;
        this.url = url;
    }

    public URLWritable(String key, String url) {
        this.set(key, url);
    }

    public void write(DataOutput out) throws IOException {
        // serialize both fields in a fixed order
        out.writeUTF(key);
        out.writeUTF(url);
    }

    public void readFields(DataInput in) throws IOException {
        // deserialize in the same order as write()
        this.key = in.readUTF();
        this.url = in.readUTF();
    }

    public int compareTo(URLWritable o) {
        // order by word first, then by url; both are plain strings,
        // so they are compared lexicographically
        int comp = this.key.compareTo(o.getKey());
        if (0 != comp) {
            return comp;
        }
        return this.url.compareTo(o.getUrl());
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((key == null) ? 0 : key.hashCode());
        result = prime * result + ((url == null) ? 0 : url.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        URLWritable other = (URLWritable) obj;
        if (key == null) {
            if (other.key != null)
                return false;
        } else if (!key.equals(other.key))
            return false;
        if (url == null) {
            if (other.url != null)
                return false;
        } else if (!url.equals(other.url))
            return false;
        return true;
    }

    @Override
    public String toString() {
        return key + "," + url;
    }

}

MapReduce

package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class InvertedMapReduce extends Configured implements Tool {

    // mapper class
    /***
     * @author hao public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     */
    public static class IndexMapper extends
            Mapper<LongWritable, Text, URLWritable, IntWritable> {

        // reusable map output value (always 1) and key (word + url)
        private IntWritable mapOutputvalue = new IntWritable(1);
        private URLWritable mapOutputKey = new URLWritable();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String lineValue = value.toString();
            // each input line is expected to look like: url$$title$$content
            String[] values = lineValue.split("\\$\\$");
            if (values.length < 3) {
                return; // skip malformed lines
            }
            String url = values[0];
            String title = values[1];
            String content = values[2];
            // split the title into words and emit <(word, url), 1> per occurrence
            String[] titleWords = title.split(" ");
            for (String word : titleWords) {
                mapOutputKey.set(word, url);
                context.write(mapOutputKey, mapOutputvalue);
            }

            // split the content into words and emit <(word, url), 1> per occurrence
            String[] contentWords = content.split(" ");
            for (String word : contentWords) {
                mapOutputKey.set(word, url);
                context.write(mapOutputKey, mapOutputvalue);
            }
        }
    }

    // reducer class
    /**
     * @author hao public class Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
     */
    public static class IndexReducer extends
            Reducer<URLWritable, IntWritable, URLWritable, IntWritable> {
        private IntWritable reduceOutPutValue = new IntWritable();

        @Override
        public void reduce(URLWritable key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // sum the partial counts produced by the combiner for this (word, url) key
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            reduceOutPutValue.set(sum);
            // write the complete (word, url) key; copying only the word would
            // leave url null and fail during serialization
            context.write(key, reduceOutPutValue);
        }
    }

    // driver
    public int run(String args[]) throws Exception {
        // step 1: get Configuration
        Configuration configuration = super.getConf();

        // step 2: create Job, chaining input -> map -> reduce -> output
        Job job = Job.getInstance(configuration, this.getClass()
                .getSimpleName());
        job.setJarByClass(this.getClass()); // the jar that contains this job

        /**
         * step 3:job input ->map ->reduce ->output
         */
        // step 3.1:input
        Path inpath = new Path(args[0]); // wrap the input path
        FileInputFormat.addInputPath(job, inpath);
        // step 3.2:mapper
        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(URLWritable.class); // specify the map output <key, value> types
        job.setMapOutputValueClass(IntWritable.class);
        // =============shuffle========================
        // 1.partitioner
        // job.setPartitionerClass(cls);
        // 2.sort
        // job.setSortComparatorClass(cls);
        // 3.combiner
        job.setCombinerClass(IndexCombiner.class);
        // 4.compress
        // set by configuration
        // 5.group
        // job.setGroupingComparatorClass(cls);
        // ==============shuffle=======================

        // step 3.3:reducer
        job.setReducerClass(IndexReducer.class); // specify the reduce output <key, value> types
        job.setOutputKeyClass(URLWritable.class);
        job.setOutputValueClass(IntWritable.class);
        /*
         * //set reduce num job.setNumReduceTasks(0);
         */
        // step 3.4:output
        Path outpath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outpath);

        boolean isSuccess = job.waitForCompletion(true);

        return isSuccess ? 0 : 1;
    }

    // main
    public static void main(String[] args) throws Exception {
        /*
         * args = new String[] {
         * "hdfs://bigdata00.hadoop-hao.com:8020/data/inputFiles/input02",
         * "hdfs://bigdata00.hadoop-hao.com:8020/data/outputFiles/output04" };
         */
        // create configuration
        Configuration configuration = new Configuration();
        // run job
        int status = ToolRunner.run(configuration, new InvertedMapReduce(),
                args);
        // exit program
        System.exit(status);
    }
}
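Assuming the classes above are packaged into a jar (the jar name below is made up), the job can be submitted with the hadoop command, passing the input and output paths as arguments, for example:

hadoop jar inverted-index.jar com.hao.bigdata.hadoop.mapreduce.InvertedIndex.InvertedMapReduce \
    hdfs://bigdata00.hadoop-hao.com:8020/data/inputFiles/input02 \
    hdfs://bigdata00.hadoop-hao.com:8020/data/outputFiles/output04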