hadoop WordCount代码详解

最新推荐文章于 2022-12-14 10:16:23 发布

小刊同学

最新推荐文章于 2022-12-14 10:16:23 发布

阅读量374

点赞数

分类专栏： hadoop 文章标签： hadoop

本文链接：https://blog.csdn.net/weixin_44285316/article/details/106843273

版权

hadoop 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper extends    //自定义mapper类(分词器mapper)
    Mapper<Object, Text, Text, IntWritable> {    //继承mapper类

    private final static IntWritable one = new IntWritable(1);    //对整数1的IntWritable封装(IntWritable是 Hadoop 中实现的用于封装 Java 数据类型的类,它的原型是public IntWritable(int value)和public IntWritable()两种。所以new IntWritable(1)是创建了这个类的一个对象，而数值1这是参数。在Hadoop中它相当于java中Integer整型变量，为这个变量赋值为1)
    private Text word = new Text();    //创建text对象(一般认为Text类和String类是等价的，但二者之间其实存在着不小差别:cnblogs.com/huiAlex/p/8182586.html)
      
    public void map(Object key, Text value, Context context)    //定义map函数(超类：所有类的祖先，text类：类似于String，Context类：Context是一个场景，描述的是一个应用程序环境的信息，即上下文，代表与操作系统的交互的一种过程。)
        throws IOException, InterruptedException {    //抛出输入输出异常、中断异常
      		StringTokenizer itr = new StringTokenizer(value.toString());    //创建StringTokenizer对象(属于 java.util 包，用于分隔字符串)
      		while (itr.hasMoreTokens()) {    //转换类型，hasMoreTokens() 方法是用来测试是否有此标记生成器的字符串可用更多的标记:https://www.yiibai.com/java/util/stringtokenizer_hasmoreelements.html
        		word.set(itr.nextToken());    //nextToken()：从第一个开始，被分隔后的字符串列表的下一个
        		context.write(word, one);    //记录(词,1)
      		}
    	}
	}

  public static class IntSumReducer extends    //自定义Reducer类
  	Reducer<Text, IntWritable, Text, IntWritable> {    //继承Reducer类
    private IntWritable result = new IntWritable();    //创建IntWritable类

    public void reduce(Text key, Iterable<IntWritable> values, Context context)    //value为IntWritable类
        throws IOException, InterruptedException {    //抛出输入输出异常、中断异常
      		int sum = 0;    //累加记录词频
      		for (IntWritable val : values) {    //对每一个mapper输出结果
        		sum += val.get();    //词频累加
      		}
     		result.set(sum);    //转化类型
      		context.write(key, result);    //写入结果
    	}
  	}

  public static void main(String[] args) throws Exception {    //主函数入口
    Configuration conf = new Configuration();    //通过Configuration可以实现在多个mapper和多个reducer任务之间共享信息。
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    //GenericOptionsParser是hadoop框架中解析命令行参数的基本类。它能够辨别一些标准的命令行参数，能够使应用程序轻易地指定namenode，jobtracker，以及其他额外的配置资源。
                                                                                     //getRemainingArgs是获取除了hadoop基本命令的其它参数，输入路径：otherArgs[0]和输出路径：otherArgs[1]两个参数
    if (otherArgs.length != 2) {    //获取失败
    	System.err.println("Usage: wordcount <in> <out>");    //抛出错误
    	System.exit(2);    //程序退出
    }
    Job job = new Job(conf, "word count");    //创建工作任务
    job.setJarByClass(WordCount.class);    //
    job.setMapperClass(TokenizerMapper.class);    //指定Mapper的格式为TokenizerMapper
    job.setCombinerClass(IntSumReducer.class);    //同理
    job.setReducerClass(IntSumReducer.class);    //同理
    //未指定输入格式，默认为TextInputFormat类，将当前文本的字符偏移量（相对于文本文件首地址的偏移量）作为key，该行的内容作为value。
    //TextTextInputFormat类提供了默认的LineRecordReader，以读入一个文本行数据记录。Mapper的输入数据格式应指定为<LongWritable,Text>
    job.setOutputKeyClass(Text.class);    //指定MapReduce输出中key的输出格式为Text
    job.setOutputValueClass(IntWritable.class);    //指定MapReduce输出中value的输出格式为IntWritable
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));    //为job作业添加输入路径
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));    //为job作业设置输出路径
    System.exit(job.waitForCompletion(true) ? 0 : 1);    //job作业执行成功：System.exit(0);是正常退出程序.job作业执行失败：System.exit(1);非正常退出程序
  }
}

参考文章：

https://www.cnblogs.com/SeaSky0606/p/4820786.html

小刊同学

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
hadoop WordCount代码详解

import java.io.IOException;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;
复制链接

扫一扫