需求:
1、在不同的文件中有着各种单词,每行单词之间以空格间隔
2、统计所有文件中每个单词在各文件中出现的次数,以每行为
单词(空格)文件1名-->单词出现的次数(空格)文件2名-->单词出现的次数(空格)文件3名-->单词出现的次数 的格式产生输出
思路:
1、先以 单词--文件名(空格)单词出现次数 的格式输出。
2、然后再将相同的单词进行整合,按要求格式输出。
(本地)
测试数据:
a.txt:
tom jerry bpf
good nice bpf
b.txt:
hello calvin
nice job bro
c.txt:
bpf you are right
so cool calvin
第一步代码:
package com.bpf.mr.inverindex;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step 1 of building an inverted index: counts, per (word, file) pair, how
 * often each word occurs, producing lines of the form
 * "word--filename<TAB>count" for step 2 to consume.
 */
public class InverIndexStepone {

    /** Mapper: for each word in a line, emits key "word--filename" with count 1. */
    static class InverIndexSteponeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Text k = new Text();
        private final IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Identify which input file this record came from.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();

            // Split on runs of whitespace: splitting on a single space (" ")
            // would turn consecutive/leading spaces into empty "words" that
            // get counted. Skip any empty token defensively.
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                if (word.isEmpty()) {
                    continue;
                }
                k.set(word + "--" + name);
                context.write(k, v);
            }
        }
    }

    /** Reducer: sums the occurrence counts for each "word--filename" key. */
    static class InverIndexSteponeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    /** Configures and submits the step-1 job against local test directories. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexStepone.class);
        job.setMapperClass(InverIndexSteponeMapper.class);
        job.setReducerClass(InverIndexSteponeReducer.class);
        // Summing is associative and commutative, so the reducer can also run
        // as a combiner to shrink the map-side shuffle output.
        job.setCombinerClass(InverIndexSteponeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\测试数据\\输入"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\测试数据\\输出"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
输出结果:
第二步代码:
package com.bpf.mr.inverindex;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step 2 of building an inverted index: regroups step-1 output
 * ("word--filename<TAB>count") by word, producing one line per word:
 * "word<TAB>file1-->n1 file2-->n2 ...".
 */
public class InverIndexSteptwo {

    /** Mapper: re-keys "word--filename<TAB>count" as word -> "filename-->count". */
    static class InverIndexSteptwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordAndRest = line.split("--");
            // Guard against malformed lines that lack the "--" separator;
            // the original indexed wordAndRest[1] unchecked and would throw
            // ArrayIndexOutOfBoundsException, failing the whole task.
            if (wordAndRest.length < 2) {
                return;
            }
            String[] fileAndCount = wordAndRest[1].split("\t");
            if (fileAndCount.length < 2) {
                return;
            }
            k.set(wordAndRest[0]);
            // No trailing space here; the reducer inserts separators between
            // entries, so output lines no longer end with a dangling space.
            v.set(fileAndCount[0] + "-->" + fileAndCount[1]);
            context.write(k, v);
        }
    }

    /** Reducer: joins all "filename-->count" entries for a word with spaces. */
    static class InverIndexSteptwoReducer extends Reducer<Text, Text, Text, Text> {

        private final Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder avoids the O(n^2) cost of repeated String
            // concatenation in a loop.
            StringBuilder joined = new StringBuilder();
            for (Text value : values) {
                if (joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(value.toString());
            }
            v.set(joined.toString());
            context.write(key, v);
        }
    }

    /** Configures and submits the step-2 job, reading step 1's output. */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverIndexSteptwo.class);
        job.setMapperClass(InverIndexSteptwoMapper.class);
        job.setReducerClass(InverIndexSteptwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\测试数据\\输出"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\测试数据\\再输出"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
输出结果: