hadoop2.0 wordcount代码讲解

最新推荐文章于 2024-08-14 11:53:07 发布

礼彬fly

最新推荐文章于 2024-08-14 11:53:07 发布

阅读量1.7k

点赞数

分类专栏： Java Bigdatda-Hadoop2.0 Hadoop旅程 文章标签： hadoop2.0 wordcount

本文链接：https://blog.csdn.net/baolibin528/article/details/43416643

版权

Java 同时被 3 个专栏收录

42 篇文章 0 订阅

订阅专栏

Bigdatda-Hadoop2.0

30 篇文章 0 订阅

订阅专栏

Hadoop旅程

27 篇文章 8 订阅

订阅专栏

hadoop2.0 wordcount代码讲解

本代码是从 hadoop-2.6.0 里面拷贝出来的。

完整代码如下：

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {

	public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{ 
 		private final static IntWritable one = new IntWritable(1);  
		private Text word = new Text();  
 		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
      			StringTokenizer itr = new StringTokenizer(value.toString());
     			while (itr.hasMoreTokens()) {
        			word.set(itr.nextToken());
       				context.write(word, one);
      			}
  	      }
         }
  
  	public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
          	private IntWritable result = new IntWritable();
            	public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
                     	int sum = 0;
                     	for (IntWritable val : values) {
                            	sum += val.get();
               		}
               		result.set(sum);
               		context.write(key, result);
               }
        }

        public static void main(String[] args) throws Exception {
       		Configuration conf = new Configuration();
    		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
   		if (otherArgs.length < 2) {
     	        	System.err.println("Usage: wordcount <in> [<in>...] <out>");
     			System.exit(2);
   		 }
   
		 Job job = new Job(conf, "word count");
    		 job.setJarByClass(WordCount.class);
    		 job.setMapperClass(TokenizerMapper.class);
   		 job.setCombinerClass(IntSumReducer.class);
    		 job.setReducerClass(IntSumReducer.class);
    		 job.setOutputKeyClass(Text.class);
    		 job.setOutputValueClass(IntWritable.class);
   		 for (int i = 0; i < otherArgs.length - 1; ++i) {
     			 FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
   		 }
   		 FileOutputFormat.setOutputPath(job,new Path(otherArgs[otherArgs.length - 1]));
   		 System.exit(job.waitForCompletion(true) ? 0 : 1);
 	 }

}

对代码进行部分注释讲解：

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {

	//四个参数，前两个为输入<key,value>对，后两个为输出<key,value>对;
	//LongWritable、IntWritable、Text可视为Java 的long、int、String替代品;
	public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{
    		//一个标记单词个数的常量，值为1，这个常量也可以不定义，在后面程序直接用整数1代替，private final static定义的是常量;
  		private final static IntWritable one = new IntWritable(1);
   		//充当中间变量，存储词;
		private Text word = new Text();
   		//map方法，key为偏移量，对value进行拆分，<span style="font-family: Arial, Helvetica, sans-serif;">context为上下文机制;</span>
 		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
   			//对转换的字符串进行分隔; 
  			StringTokenizer itr = new StringTokenizer(value.toString());
     			//利用循环函数进行依次处理;
			while (itr.hasMoreTokens()) {
 				//返回从当前位置到下一个分隔符的字符串;     
  				word.set(itr.nextToken());
  				//如 context.write("hello",1);
  				context.write(word, one);    
 			}
   	      }
         }
  
	//四个参数，前两个为输入<key,value>对，后两个为输出<key,value>对;
   	public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
  		//定义一个变量;
        	private IntWritable result = new IntWritable();
		//reduce方法，key为如 "hello"，Iterable遍历所有key的个数;
            	public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
    			//  用于记录key个数的变量;
                	int sum = 0;
   			//求key的个数; 
                 	for (IntWritable val : values) {
                             	sum += val.get();
               		}
   			//把sum个数存到result中去;
              		result.set(sum);
			//如 context.write("hello",7);      
              		context.write(key, result);
                }
        }


	//主方法;
        public static void main(String[] args) throws Exception {
   		//指定作业执行规范;
       		Configuration conf = new Configuration();
    		//这里需要配置参数即输入和输出的HDFS的文件路径 
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
   		if (otherArgs.length < 2) {
     	        	System.err.println("Usage: wordcount <in> [<in>...] <out>");
     			System.exit(2);
   		}
   		//设置Job名称、运行对象;
		Job job = new Job(conf, "word count");
    		job.setJarByClass(WordCount.class);
   		//为job设置map类;
 		job.setMapperClass(TokenizerMapper.class);
 		//为job设置Combiner类;
 		job.setCombinerClass(IntSumReducer.class);
   		//为job设置 reduce类;
 		job.setReducerClass(IntSumReducer.class);
    		//设置输出key类型;
		job.setOutputKeyClass(Text.class);
   		//设置输出value类型;
 		job.setOutputValueClass(IntWritable.class);
  		//设置输入路径;
 		for (int i = 0; i < otherArgs.length - 1; ++i) {
     			FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
   		}
  		//设置输出路径;
 		FileOutputFormat.setOutputPath(job,new Path(otherArgs[otherArgs.length - 1]));
   		System.exit(job.waitForCompletion(true) ? 0 : 1);
 	 }

}