hadoop MapReduce倒排索引编程

36 篇文章 1 订阅
12 篇文章 0 订阅

统计每个单词在每个文件中出现的次数

输入数据:

file1.txt

Spark is so powerful

file2.txt

Spark is the most exciting thing happening in big data today

file3.txt

Hello Spark Hello again Spark

输出格式:

代码实现:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * FileName: InvertedIndex
 * Author:   hadoop
 * Email:    3165845957@qq.com
 * Date:     18-10-9 下午9:04
 * Description:
 */
/**
 * Builds an inverted index over a set of text files: for every word, lists
 * each file it appears in together with the per-file occurrence count.
 *
 * <p>Output format per word: {@code word \t file1:count1;file2:count2;...}
 */
public class InvertedIndex {

    /**
     * Mapper: for every whitespace-delimited token in a line, emits
     * ("word:filename", "1"). The file name is captured once per split in
     * {@link #setup}.
     */
    public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
        private static final Text ONE = new Text("1"); // occurrence marker; never mutated
        private final Text outKey = new Text();        // reused to avoid per-record allocation
        private String fileName;                       // name of this split's source file

        /**
         * Caches the current input file's name so each token can be tagged with it.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            fileName = inputSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.trim().isEmpty()) {
                return; // skip blank lines
            }
            StringTokenizer tokenizer = new StringTokenizer(line); // split on whitespace
            while (tokenizer.hasMoreTokens()) {
                // Key combines word and file name so the combiner can sum per-file counts.
                outKey.set(tokenizer.nextToken() + ":" + fileName);
                context.write(outKey, ONE);
            }
        }
    }

    /**
     * Combiner: sums the counts for each "word:filename" key locally, then
     * re-keys the record as ("word", "filename:sum") before the shuffle.
     *
     * <p>NOTE(review): this combiner CHANGES the key, which violates the
     * combiner contract — Hadoop may run a combiner zero or more times, and if
     * it is skipped the reducer receives un-rekeyed "word:filename" keys with
     * raw "1" values, producing wrong output. This is the classic textbook
     * inverted-index design; a robust job would do the re-keying in a real
     * map/reduce stage instead. Left as-is to preserve the published structure.
     */
    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text item : values) {
                sum += Integer.parseInt(item.toString()); // parseInt: no boxing
            }
            // Split "word:filename" back apart; emit word -> "filename:count".
            String[] keyArray = key.toString().split(":");
            context.write(new Text(keyArray[0]), new Text(keyArray[1] + ":" + sum));
        }
    }

    /**
     * Reducer: concatenates all "filename:count" entries for a word into a
     * single semicolon-separated list.
     */
    public static class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder (not StringBuffer): no synchronization needed here.
            StringBuilder result = new StringBuilder();
            for (Text item : values) {
                if (result.length() > 0) {
                    result.append(';'); // separator between entries, no trailing one
                }
                result.append(item.toString());
            }
            context.write(key, new Text(result.toString()));
        }
    }

    /**
     * Job driver. Usage: {@code InvertedIndex <in> [<in>...] <out>}.
     *
     * @param args generic Hadoop options followed by input dir(s) and output dir
     * @throws Exception if job submission or HDFS access fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // MapReduce job configuration
        // Parse generic Hadoop options FIRST so positional arguments are correct.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: InvertedIndex <in> [<in>...] <out>");
            System.exit(2);
        }
        // BUG FIX: the original deleted new Path(args[1]) BEFORE parsing generic
        // options — with multiple inputs or -D flags, args[1] is an INPUT
        // directory and would have been recursively deleted. The output path is
        // always the last remaining argument.
        Path outputPath = new Path(otherArgs[otherArgs.length - 1]);
        FileSystem hdfs = outputPath.getFileSystem(conf);
        // exists() rather than isDirectory(): a pre-existing file would also
        // make the job fail, so remove it too.
        if (hdfs.exists(outputPath)) {
            hdfs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMapper.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReducer.class);
        // Map output types (also the combiner's input/output types).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final (reducer) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // All arguments except the last are input paths.
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值