Hadoop之WordCount源代码

原创 2015年11月18日 11:36:39


一、旧版WordCount源代码

//package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class WordCount {
	public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
		public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)throws IOException {
			String line = value.toString();
			StringTokenizer tokenizer = new StringTokenizer(line);
			while (tokenizer.hasMoreTokens()) {
				word.set(tokenizer.nextToken());
				output.collect(word, one);
			} //while
		} //map()
	} //static class Map

	public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
		public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter)throws IOException {
			int sum = 0;
			while (values.hasNext()) {
				sum += values.next().get();
			}
			output.collect(key, new IntWritable(sum));
		} //reduce()
	} //static class Reduce
	public static void main(String[] args) throws Exception {
		JobConf conf = new JobConf(WordCount.class);
		conf.setJobName("wordcount");
		conf.setOutputKeyClass(Text.class);
		conf.setOutputValueClass(IntWritable.class);
		conf.setMapperClass(Map.class);
		conf.setCombinerClass(Reduce.class);
		conf.setReducerClass(Reduce.class);
		conf.setInputFormat(TextInputFormat.class);
		conf.setOutputFormat(TextOutputFormat.class);
		FileInputFormat.setInputPaths(conf, new Path(args[0]));
		FileOutputFormat.setOutputPath(conf, new Path(args[1]));
		JobClient.runJob(conf);
	} //main()
} //class WordCount



二、新版WordCount源代码

//package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
	public static class TokenizerMapper
			extends Mapper<Object, Text, Text, IntWritable>{
			private final static IntWritable one = new IntWritable(1);
			private Text word = new Text();
			public void map(Object key, Text value, Context context)throws IOException, InterruptedException {
				StringTokenizer itr = new StringTokenizer(value.toString());
				while (itr.hasMoreTokens()) {
					word.set(itr.nextToken());
					context.write(word, one);
				} //while
			} //map()
	} //static class TokenizerMapper
	public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
		private IntWritable result = new IntWritable();
		public void reduce(Text key, Iterable<IntWritable> values,Context context)throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		} //reduce
	} //static class IntSumReducer
	public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: wordcount <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf, "word count");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
} //main()
} //class WordCount





版权声明:本文为博主原创文章,未经博主允许不得转载。

相关文章推荐

大数据系统与大规模数据分析 之 作业二

大数据系统与大规模数据分析 之 作业二 问题描述 Hadoop编程 程序源码大数据系统与大规模数据分析 之 作业二问题描述作业二:Hadoop编程 总体任务 输入文件: 文本文件 source d...

Hadoop的词频统计源代码WordCount

  • 2017年06月03日 15:51
  • 2KB
  • 下载

hadoop wordcount源代码分析

package org.apache.hadoop.examples;   import java.io.IOException; import java.util.StringTokenize...

hadoop源代码分析(二)从wordCount开始,剖析mapreduce的运行机制

在上一篇文章中,只是简单介绍了Mapreduce作业,从执行hadoop jar test.jar 的shell命令,到是如何被加载并找到主类的。那么,从这个文章开始,研究从mapreduce的mai...

查看Hadoop-1.2.1里面的例子jar并对WordCount进行修改

1.查看Hadoop-1.2.1里面的例子jar 进入 /usr/program/hadoop-env/hadoop-1.2.1/bin 目录下,直接用hadoop命令可以看到所有可以使用的命令 ...

ubuntu运行hadoop的wordcount

  • 2012年09月22日 21:16
  • 543B
  • 下载

hadoop 自学指南三之WordCount解析(2)

一、前言 自从0.20.2版本开始,hadoop 提供了一个新的API,新的API在org.apache.hadoop.mapreduce中,旧的api在org.apache.hadoop.mapre...
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:Hadoop之WordCount源代码
举报原因:
原因补充:

(最多只允许输入30个字)