hadoop MapReduce倒排索引编程

最新推荐文章于 2021-12-21 20:16:02 发布

zerone-f

最新推荐文章于 2021-12-21 20:16:02 发布

阅读量489

点赞数

分类专栏： hadoop 大数据 MapReduce 文章标签： hadoop MapReduce 倒排索引

本文链接：https://blog.csdn.net/someby/article/details/82990991

版权

大数据同时被 3 个专栏收录

196 篇文章 4 订阅

订阅专栏

hadoop

36 篇文章 1 订阅

订阅专栏

MapReduce

12 篇文章 0 订阅

订阅专栏

统计每个单词在每个文件中出现的次数

输入数据：

file1.txt

Spark is so powerful

file2.txt

Spark is the most exciting thing happening in big data today

file3.txt

Hello Spark Hello again Spark

输出格式：每行为“单词”、制表符、以分号分隔的“文件名:出现次数”列表，例如：

Hello	file3.txt:2

代码实现：

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

/**
 * FileName: InvertedIndex
 * Author:   hadoop
 * Email:    3165845957@qq.com
 * Date:     18-10-9 下午9:04
 * Description:
 */
public class InvertedIndex {
    //使用Mapper将数据文件中的数据本身作为Mapper输出的key直接输出
    public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
        final Text number = new Text("1"); //标记每一个key
        private  String fileName; //存取每一个文件的名称
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (line.trim().length() >0){
                StringTokenizer stringTokenizer = new StringTokenizer(line); //对输入的句子进行切分单词
                while (stringTokenizer.hasMoreTokens()){
                    String keyForCombiner = stringTokenizer.nextToken()+":"+fileName; //将key设置为word：filename
                    context.write(new Text(keyForCombiner),number);

                }
            }
        }

        /**
         * 初始化，获得当前输入文件的名称
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
           FileSplit inputSplit = (FileSplit)context.getInputSplit();
           fileName = inputSplit.getPath().getName();
        }
    }


    //现在本地进行归并，在进行Shuffle
    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (Text item : values){
                sum += Integer.valueOf(item.toString());
            }
            String[] keyArray = key.toString().split(":");
            context.write(new Text(keyArray[0]),new Text(keyArray[1]+":"+sum));
        }
    }
    public static class InvertedIndexReducer extends Reducer<Text, Text, Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuffer result = new StringBuffer();
            for (Text item: values){
                result.append(item+";");
            }
            context.write(key,new Text(result.toString().substring(0,result.toString().length()-1)));
        }
    }


    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();//设置MapReduce的配置
        Path outputPath = new Path(args[1]);
        FileSystem hdfs = outputPath.getFileSystem(conf);
        // 判断路径是否存在，如果存在，则删除
        if (hdfs.isDirectory(outputPath)){
            hdfs.delete(outputPath,true);
        }
        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        if(otherArgs.length < 2){
            System.out.println("Usage: InvertedIndex <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf,"InvertedIndex");
        //设置主类
        job.setJarByClass(InvertedIndex.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        //设置处理Mapper
        job.setMapperClass(InvertedIndexMapper.class);
        //设置map输出的key类型
        job.setMapOutputKeyClass(Text.class);
        //设置map输出的value类型
        job.setMapOutputValueClass(Text.class);
        //设置Reducer
        job.setReducerClass(InvertedIndexReducer.class);
        //设置Reducer输出的Key类型
        job.setOutputKeyClass(Text.class);
        //设置Reducer输出的value类型
        job.setOutputValueClass(Text.class);
        //设定输入路径
        for (int i = 0; i < otherArgs.length-1;++i){
            FileInputFormat.addInputPath(job,new Path(otherArgs[i]));
        }
        //设置输出路径
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length-1]));
        System.exit(job.waitForCompletion(true)?0:1);
    }
}