Experiment Content and Requirements
1. In the Eclipse environment, write a WordCount program that counts every word occurring more than k times, excluding stop words (such as a, an, of, in, on, the, this, that, ...), and outputs the final results sorted by word frequency from high to low;
2. Run the program on the cluster to process the document data of the collected works of Shakespeare;
3. You may create your own stop-word list file; it only needs to contain some of the stop words, not all of them. The parameter k is specified dynamically as an input argument (e.g. k=10). A sample stop-word file is sketched below.
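A minimal patterns.txt (the stop-word file referenced in the run commands below) might contain one entry per line, for example as shown here. The mapper applies each entry as a Java regular expression via replaceAll, so wrapping each word in \b word boundaries keeps it from being stripped out of the middle of longer words (a bare "the", for instance, would also mangle "theater"):

\ba\b
\ban\b
\bof\b
\bin\b
\bon\b
\bthe\b
\bthis\b
\bthat\b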
Code:
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.InverseMapper;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.FileSystem;

public class WordCount extends Configured implements Tool {

    // Mapper class
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        static enum Counters { INPUT_WORDS }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Set<String> patternsToSkip = new HashSet<String>();
        private long numRecords = 0;
        private String inputFile;
        private String otherPattern = "[^\\w]"; // regex matching every character that is not 0-9, a-z, A-Z or underscore

        /**
         * Override configure(): load the stop-word pattern files from the distributed cache.
         */
        public void configure(JobConf job) {
            inputFile = job.get("map.input.file");
            if (job.getBoolean("wordcount.skip.patterns", false)) {
                Path[] patternsFiles = new Path[0];
                try {
                    patternsFiles = DistributedCache.getLocalCacheFiles(job);
                } catch (IOException ioe) {
                    System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe));
                }
                for (Path patternsFile : patternsFiles) {
                    parseSkipFile(patternsFile);
                }
            }
        }

        /**
         * Read a stop-word file and add each line (one pattern per line) to the skip list.
         */
        private void parseSkipFile(Path patternsFile) {
            try {
                BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString()));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe));
            }
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            line = line.replaceAll(otherPattern, " "); // replace every character that is not 0-9, a-z, A-Z or underscore with a space
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, ""); // each skip entry is applied as a regular expression
            }
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                reporter.incrCounter(Counters.INPUT_WORDS, 1);
            }
            if ((++numRecords % 100) == 0) {
                reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile);
            }
        }
    }

    // Reducer class
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        private int k;

        /**
         * Override configure(): read the frequency threshold k from the job configuration.
         */
        public void configure(JobConf job) {
            k = job.getInt("k", 0);
        }

        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            if (sum > k)
                output.collect(key, new IntWritable(sum));
        }
    }

    // Comparator that sorts IntWritable keys in descending order
    private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    // run() method
    public int run(String[] args) throws Exception {
        // First job: count word frequencies over the corpus, filter by the threshold k and remove stop words
        JobConf conf = new JobConf(getConf(), WordCount.class);
        conf.setJobName("word count");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
                conf.setBoolean("wordcount.skip.patterns", true);
            } else {
                other_args.add(args[i]);
            }
        }
        FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));

        Path tempDir = new Path("WordCount_temp" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // temporary directory
        FileOutputFormat.setOutputPath(conf, tempDir); // the counting job writes to the temporary directory, which the sorting job then uses as its input directory
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(conf);

        // Second job: sort the words by frequency from high to low
        JobConf sortJob = new JobConf(getConf(), WordCount.class);
        sortJob.setJobName("sort");
        sortJob.setJarByClass(WordCount.class);
        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);
        /* InverseMapper is provided by the Hadoop library; it swaps the key and value of each input pair. */
        sortJob.setMapperClass(InverseMapper.class);
        /* Limit the number of reducers to 1 so that a single result file is produced. */
        sortJob.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(sortJob, new Path(other_args.get(1)));
        sortJob.setOutputFormat(TextOutputFormat.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        /* Hadoop sorts IntWritable keys in ascending order by default, but we need descending order.
         * IntWritableDecreasingComparator implements the reversed comparison and is set here as the
         * comparator used to sort the output keys (the word frequencies). */
        sortJob.setOutputKeyComparatorClass(IntWritableDecreasingComparator.class);
        JobClient.runJob(sortJob);

        FileSystem.get(conf).delete(tempDir, true); // delete the temporary directory
        return 0;
    }

    // main() method
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WordCount(), args);
        System.exit(res);
    }
}
Export the project from Eclipse as a jar named WordCount.jar (specifying WordCount as the main class, so that hadoop jar can run it without an explicit class name).
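If you would rather build from the command line instead of Eclipse, a rough sketch is the following (the path to the Hadoop core jar is an assumption; adjust it to your installation). A jar built this way has no Main-Class entry in its manifest, so the class name WordCount must then be given right after WordCount.jar in the hadoop jar command below:

mkdir wordcount_classes
javac -classpath /path/to/hadoop-core.jar -d wordcount_classes WordCount.java
jar cvf WordCount.jar -C wordcount_classes .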
Run:
hadoop namenode -format
start-all.sh
jps
hadoop dfs -mkdir test-input
hadoop dfs -copyFromLocal XXX_folder test-input    (XXX_folder is the local folder holding the corpus)
hadoop dfs -mkdir tempDir
hadoop dfs -copyFromLocal patterns.txt tempDir    (patterns.txt is the stop-word file)
hadoop jar WordCount.jar -D k=10 test-input/XXX_folder test-output -skip tempDir/patterns.txt    (k is the frequency threshold and can be adjusted)
hadoop dfs -get test-output/part-00000 output.txt
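Because the sort job writes with TextOutputFormat, using the frequency as the key and the word as the value, each line of test-output/part-00000 is a tab-separated frequency/word pair, ordered from the most frequent word downwards. To peek at the top entries directly on HDFS (assuming the output path used above):

hadoop dfs -cat test-output/part-00000 | head -n 20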
References:
http://blog.chinaunix.net/space.php?uid=20761674&do=blog&cuid=2157576
http://blog.csdn.net/xw13106209/archive/2011/01/07/6122719.aspx
This post is the author's own notes, put online for everyone's reference. Please do not copy it verbatim. Thanks!