MapReduce之三—搜索引擎-倒排索引实现

1.原始数据

a.txt

hello tom
hello jerry
hello kitty
hello world
hello tom

b.txt

hello jerry
hello tom
hello world

过程模拟

Map阶段
<0,"hello tom">
....


context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);

context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
--------------------------------------------------------
combiner阶段
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>

<"hello->b.txt",1>
<"hello->b.txt",1>
<"hello->b.txt",1>

context.write("hello","a.txt->5");
context.write("hello","b.txt->3");
--------------------------------------------------------
Reducer阶段
<"hello",{"a.txt->5","b.txt->3"}>


context.write("hello","a.txt->5 b.txt->3");
-------------------------------------------------------
hello   "a.txt->5 b.txt->3"
tom     "a.txt->2 b.txt->1"
kitty   "a.txt->1"
.......

2.实现代码

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ReverseSort {

    /**
     * Driver for an inverted-index job: maps each word to the files it occurs in,
     * with a per-file occurrence count (e.g. {@code hello  "a.txt->5 b.txt->3"}).
     *
     * Usage: {@code ReverseSort <inputPath> <outputPath> <numReducers>}
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: ReverseSort <input> <output> <numReducers>");
            System.exit(2);
        }
        Job job = Job.getInstance(new Configuration());
        job.setJobName("reverseSort");
        job.setJarByClass(ReverseSort.class);

        job.setMapperClass(RSMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setReducerClass(RSReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // NOTE(review): this combiner rewrites the key ("word->file" -> "word"),
        // which is only correct if Hadoop runs it exactly once per map-side group.
        // The framework may run a combiner zero or multiple times: zero runs feed
        // raw ("word->file", "1") pairs to the reducer; repeated runs cannot parse
        // the combiner's own output. A two-job pipeline is the robust design.
        job.setCombinerClass(RSCombiner.class);
        // Number of reducers (= number of output partitions); Hadoop's default is 1.
        job.setNumReduceTasks(Integer.parseInt(args[2]));

        // Propagate job success/failure through the process exit code
        // (the original discarded the boolean and always exited 0).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Emits ("word->fileName", "1") for every word of every input line. */
    public static class RSMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused output holders to avoid per-record allocation (standard Hadoop idiom).
        private final Text k2 = new Text();
        private final Text v2 = new Text();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // The source file name is folded into the key so counts stay per-file.
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String name = fileSplit.getPath().getName();
            // Split on any whitespace run; the original split(" ") produced
            // empty tokens when words were separated by multiple spaces.
            for (String word : value.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    continue; // leading whitespace yields one empty token
                }
                k2.set(word + "->" + name);
                v2.set("1");
                context.write(k2, v2); // e.g. ("hello->b.txt", "1")
            }
        }
    }

    /** Sums the per-file counts and re-keys ("word->file", n) as ("word", "file->n"). */
    public static class RSCombiner extends Reducer<Text, Text, Text, Text> {
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void reduce(Text k2, Iterable<Text> v2s,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Key arrives as "word->fileName"; limit=2 keeps the file-name part
            // intact even if it ever contained the "->" separator itself.
            String[] fields = k2.toString().split("->", 2);
            long sum = 0;
            // Renamed from "v": the original loop variable shadowed the Text field.
            for (Text count : v2s) {
                sum += Long.parseLong(count.toString());
            }
            outKey.set(fields[0]);
            outValue.set(fields[1] + "->" + sum);
            context.write(outKey, outValue); // e.g. ("hello", "a.txt->5")
        }
    }

    /** Joins all "file->count" values for a word into one space-separated line. */
    public static class RSReducer extends Reducer<Text, Text, Text, Text> {
        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> v2s,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text text : v2s) {
                // Separator between entries only; the original appended a
                // trailing space after the last value.
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(text.toString());
            }
            outValue.set(sb.toString());
            context.write(key, outValue); // e.g. ("hello", "a.txt->5 b.txt->3")
        }
    }
}

3.倒排索引输出结果
执行:

hadoop jar rsort.jar cn.zx.hadoop.mr.Rsort.ReverseSort /test/rs /test/rsout 3

结果:
这里写图片描述

这里写图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

IT布道者

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值