计算不同引用次数的专利数目

最新推荐文章于 2024-07-12 10:57:35 发布

程杉耘朵

最新推荐文章于 2024-07-12 10:57:35 发布

阅读量2.4k

点赞数 1

分类专栏： hadoop

本文链接：https://blog.csdn.net/chsyd1028/article/details/72152414

版权

hadoop 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

在上一篇中，对专利的引用次数进行了统计，有了如下的数据。

这一次，要对这个数据进行统计，来算出被引用过1次的专利的个数、被引用过2次专利的个数、3次的、4次的....

代码：

package org.apache.hadoop.pr;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class CitationHistogram extends Configured implementsTool{
   
    //Mapper类
    publicstatic class MapClass extends MapReduceBase implementsMapper{

       privatefinal static IntWritable uno = newIntWritable(1);  //uno为1，final型
       privateIntWritable citationCount = new IntWritable(); //实例化一个citationCount用来存传进来的value
      
       public voidmap(Text key, Text value,
                     OutputCollector output,
                      Reporterreporter)
             throwsIOException {
         citationCount.set(Integer.parseInt(value.toString())); //把value值赋给citationCount
         output.collect(citationCount, uno); //输出类型的键为citationCount即被引用的次数，值为1
       }
         
    }
   
    //mapper和reducer之间会有一个partitioner，重定向mapper的输出，根据key将mapper的结果输出给不同的reducer
   //最后进reducer的时候形式就会是，而这个程序里的value都是1

   //Reducer类
    publicstatic class Reduce extends MapReduceBase implements Reducer{

       public voidreduce(IntWritable key, Iterator values, OutputCollectoroutput,
             Reporterreporter) throws IOException {
          int count =0;
         while(values.hasNext()){
             count +=values.next().get(); //只要后面还有value,count就加一个value（也就是1）
          }
         output.collect(key, new IntWritable(count)); //最后的输出形式就是key和count值
       }
    }

    public intrun(String[] args) throws Exception {
      Configuration conf = getConf();
      
       JobConf job= new JobConf(conf, CitationHistogram.class);
      
       Path in =new Path(args[0]);
       Path out =new Path(args[1]);
      FileInputFormat.setInputPaths(job, in);
      FileOutputFormat.setOutputPath(job, out);
      
      job.setJobName("CitationHistogram");
      job.setMapperClass(MapClass.class);
      job.setReducerClass(Reduce.class);
      
      job.setInputFormat(KeyValueTextInputFormat.class);
      job.setOutputFormat(TextOutputFormat.class);
      job.setOutputKeyClass(IntWritable.class);//因为上面Reducer中的K是IntWritable，这里必须保持一致
      job.setOutputValueClass(IntWritable.class);//因为上面Reducer中的V都是IntWritable，这里必须保持一致
      
      JobClient.runJob(job);
       return0;
    }
   
   public  static void main(String[] args) throwsException{
       int res =ToolRunner.run(new Configuration(), new CitationHistogram(),args);
      
      System.exit(res);
    }
}

在这一次的代码中，Mapper的输入类型还是Text，但key和value的输出类型都变成了IntWritable，为了匹配它，Reduce的输入、输出类型也变成了IntWritable，Driver（run()方法）中也相应改变了输出键和值的类。
除了数据类型的变化外，还少了一行：job.set("key.value.separator.in.input.line", 分隔符);，这是设置KeyValueTextInputFormat的分隔符，这里不再进行设置，默认的是制表符，因为原始数据中的分隔就是tab。

在上述代码中，已经标记了详细的备注，分别注释了每一步操作的内容，接下来配上它的流程图：