Rewriting Hadoop's WordCount Program

Lab Content and Requirements

1. Write a WordCount program in Eclipse that counts every word occurring more than k times, excluding stop words (such as a, an, of, in, on, the, this, that, ...), and output the results sorted by word frequency from high to low;

2. Run the program on the cluster to process the Shakespeare corpus;

3. You may build your own Stop-Word list file; it only needs to contain some of the stop words, not all of them (a sample stop-word file is sketched right after the code listing below). The parameter k is specified dynamically as an input argument (e.g. k=10).

 

Code:

import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.InverseMapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.FileSystem;

public class WordCount extends Configured implements Tool {
   
    // Map class
   public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
     static enum Counters { INPUT_WORDS }
     private final static IntWritable one = new IntWritable(1);
     private Text word = new Text();
     private Set<String> patternsToSkip = new HashSet<String>();
     private long numRecords = 0;
     private String inputFile;
     private String otherPattern = "[^\\w]"; // regex matching any character that is not 0-9, a-z, A-Z or underscore
    
     /**
      * Override configure() to record the input file name and, when enabled,
      * load the stop-word patterns from the distributed cache.
      */
     public void configure(JobConf job) {
       inputFile = job.get("map.input.file");
       if (job.getBoolean("wordcount.skip.patterns", false)) {
         Path[] patternsFiles = new Path[0];
         try {
           patternsFiles = DistributedCache.getLocalCacheFiles(job);
         } catch (IOException ioe) {
           System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe));
         }
         for (Path patternsFile : patternsFiles) {
           parseSkipFile(patternsFile);
         }
       }
     }
     /**
      * Read the stop-word file and add each pattern to the skip list.
      */
     private void parseSkipFile(Path patternsFile) {
       try {
         BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString()));
         String pattern = null;
         while ((pattern = fis.readLine()) != null) {
           patternsToSkip.add(pattern);
         }
       } catch (IOException ioe) {
         System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe));
       }
     }
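     /**
      * map: replace non-word characters with spaces, strip the stop-word
      * patterns, then emit (word, 1) for every remaining token.
      */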
     public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
       String line = value.toString();
       line = line.replaceAll(otherPattern, " ");                // replace every non-word character with a space
       for (String pattern : patternsToSkip) {
         line = line.replaceAll(pattern, "");
       }
       StringTokenizer tokenizer = new StringTokenizer(line);
       while (tokenizer.hasMoreTokens()) {
         word.set(tokenizer.nextToken());
         output.collect(word, one);
         reporter.incrCounter(Counters.INPUT_WORDS, 1);
       }
       if ((++numRecords % 100) == 0) {
         reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile);
       }
     }
   }
  
   // Reduce class
   public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
      private int k;

      /**
       * Override configure() to read the threshold k from the job configuration.
       */
      public void configure(JobConf job) {
        k = job.getInt("k", 0);
      }
        
      /**
       * reduce: sum the counts for a word and emit it only when the total
       * exceeds the threshold k.
       */
      public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
          sum += values.next().get();
        }
        if (sum > k) {
          output.collect(key, new IntWritable(sum));
        }
      }
   }
  
   // Comparator that inverts IntWritable's natural ascending order, so the
   // sort job outputs word counts from highest to lowest.
   private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
     public int compare(WritableComparable a, WritableComparable b) {
       return -super.compare(a, b);
     }

     public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
       return -super.compare(b1, s1, l1, b2, s2, l2);
     }
   }
  
   // run method
   public int run(String[] args) throws Exception {
     // First job: count word frequencies over the corpus, filter out stop words, and apply the frequency threshold k
     JobConf conf = new JobConf(getConf(), WordCount.class);
     conf.setJobName("word count");
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(IntWritable.class);
     conf.setMapperClass(Map.class);
     conf.setReducerClass(Reduce.class);
     conf.setInputFormat(TextInputFormat.class);
     List<String> other_args = new ArrayList<String>();
     for (int i=0; i < args.length; ++i) {
       if ("-skip".equals(args[i])) {
         DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
         conf.setBoolean("wordcount.skip.patterns", true);
       } else {
         other_args.add(args[i]);
       }
     }
     FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
     Path tempDir = new Path("WordCount_temp" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));             // temporary directory
     FileOutputFormat.setOutputPath(conf, tempDir);                            // write the counting job's output to the temporary directory; the sort job then reads it as its input
     conf.setOutputFormat(SequenceFileOutputFormat.class);
     JobClient.runJob(conf);
    
     // Sort job: order the words by frequency, from highest to lowest
     JobConf sortJob = new JobConf(getConf(), WordCount.class);
     sortJob.setJobName("sort");
     sortJob.setJarByClass(WordCount.class);
     FileInputFormat.setInputPaths(sortJob, tempDir);
     sortJob.setInputFormat(SequenceFileInputFormat.class);
     /* InverseMapper is provided by the Hadoop library; it swaps the key and value of each map output pair, so the word count becomes the key. */
     sortJob.setMapperClass(InverseMapper.class);
     /* Limit the number of reducers to 1 so that a single output file is produced. */
     sortJob.setNumReduceTasks(1);
     FileOutputFormat.setOutputPath(sortJob, new Path(other_args.get(1)));
     sortJob.setOutputFormat(TextOutputFormat.class);
     sortJob.setOutputKeyClass(IntWritable.class); 
     sortJob.setOutputValueClass(Text.class);
     /* Hadoop sorts IntWritable keys in ascending order by default, but we need
      * descending order, so we implement IntWritableDecreasingComparator and
      * register it as the comparator used to sort the output key (the word count). */
     sortJob.setOutputKeyComparatorClass(IntWritableDecreasingComparator.class);
     JobClient.runJob(sortJob);
     FileSystem.get(conf).delete(tempDir, true); // delete the temporary directory
     return 0; 
    
   }
  
   // main method
   public static void main(String[] args) throws Exception {
     int res = ToolRunner.run(new Configuration(), new WordCount(), args);
     System.exit(res);
   }
}
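
The map task applies every line of the stop-word file as a Java regular expression through replaceAll, so a bare pattern such as the would also delete "the" inside longer words like "there". A minimal sketch of a patterns.txt that avoids this by using word boundaries (the exact patterns are only an illustration, and they are case-sensitive, so capitalized forms would need their own entries):

\bthe\b
\ba\b
\ban\b
\bof\b
\bin\b
\bon\b
\bthis\b
\bthat\b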

 

Export the project from Eclipse as a jar named WordCount.jar.
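This assumes the jar's manifest lists WordCount as its Main-Class; if it does not, the class name has to be given explicitly after the jar name when running it (hadoop jar WordCount.jar WordCount ...).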

To run:

hadoop namenode -format

start-all.sh

jps

hadoop dfs -mkdir test-input

hadoop dfs -copyFromLocal XXX test-input    (XXX is the local folder holding the Shakespeare corpus)

hadoop dfs -copyFromLocal patterns.txt tempDir    (patterns.txt is the stop-word file)

hadoop jar WordCount.jar -D k=10 test-input/XXX test-output -skip tempDir/patterns.txt    (k is the frequency threshold and can be adjusted; XXX is the corpus folder name)

hadoop dfs -get test-output/part-00000 output.txt
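
Because WordCount is launched through ToolRunner, the GenericOptionsParser strips the -D k=10 option and puts k into the job configuration, which is what job.getInt("k", 0) reads in the reducer. Each line of the final output has the form count<TAB>word (the sort job swapped key and value), ordered from the highest count down. To peek at the top of the result directly on HDFS, something like the following works:

hadoop dfs -cat test-output/part-00000 | head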

 

References:

http://blog.chinaunix.net/space.php?uid=20761674&do=blog&cuid=2157576

http://blog.csdn.net/xw13106209/archive/2011/01/07/6122719.aspx

 

This post is kept as my own notes and is shared online for reference. Please do not copy it directly. Thank you!
