Hadoop Study Notes, Part 2: Basic MapReduce Programming

Introduction

The previous article in this series introduced Hadoop's basic concepts and architecture. This article walks through basic MapReduce programming with a concrete example. Before continuing, it is worth revisiting that material, and at a minimum understanding what this diagram shows.

Practice

- Create a Maven project and add the Hadoop dependency:
```xml
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-core</artifactId>
    <version>0.20.2</version>
</dependency>

<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/content/groups/public</url>
    </repository>
</repositories>
```
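If you are creating the project from scratch, the snippets above sit in the pom.xml roughly as follows; the project's own groupId, artifactId, and version here are placeholders:

```xml
<project xmlns="http://maven.apache.org/POM/4.0.0">
    <modelVersion>4.0.0</modelVersion>
    <!-- Placeholder coordinates for the example project itself -->
    <groupId>com.example</groupId>
    <artifactId>wordcount</artifactId>
    <version>1.0</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>0.20.2</version>
        </dependency>
    </dependencies>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/content/groups/public</url>
        </repository>
    </repositories>
</project>
```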
- Overview: the example below is the classic WordCount job. It counts how many times each word occurs in the input text files, using a Mapper that emits (word, 1) pairs and a Reducer that sums the counts for each word.
- Write the Map class:
```java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line into tokens and emit (word, 1) for each one.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
```
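To make the data flow concrete, here is a minimal standalone sketch of the same tokenize-and-emit logic; the class name and sample input line are invented for illustration, and nothing here touches the Hadoop API:

```java
import java.util.StringTokenizer;

// Illustrative only: mimics the mapper's tokenize-and-emit loop on one line.
public class MapLogicDemo {
    public static void main(String[] args) {
        String line = "hello world hello";  // hypothetical input line
        StringTokenizer itr = new StringTokenizer(line);
        while (itr.hasMoreTokens()) {
            // The real mapper calls context.write(word, one) here instead.
            System.out.println("(" + itr.nextToken() + ", 1)");
        }
        // Prints: (hello, 1)  (world, 1)  (hello, 1)
    }
}
```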
- Write the Reduce class:
```java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum all the counts emitted for this word.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
```
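Between the map and reduce phases, the framework sorts the intermediate (word, 1) pairs and groups them by key. For the sample line above, the reducer would receive ("hello", [1, 1]) and ("world", [1]) and write out ("hello", 2) and ("world", 1). And because WordCountReducer is also registered as the combiner in the job below, part of this summing can already happen on the map side before the data is shuffled.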
- Define the job:
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        /** Create a job and give it a name so its progress can be tracked. */
        Job job = new Job(conf, "word count");

        /** When the job runs on a Hadoop cluster, the code must be packaged into a
         *  jar file (Hadoop distributes it across the cluster). setJarByClass tells
         *  Hadoop to locate that jar by finding the one containing the given class. */
        job.setJarByClass(WordCount.class);

        /** Set the mapper, combiner, and reducer classes to use. */
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        /** There is no code here to set the map input types because we use the
         *  default TextInputFormat: it splits text files into InputSplits line by
         *  line, and LineRecordReader parses each InputSplit into <key, value>
         *  pairs, where key is the position of the line in the file and value is
         *  the line itself. */

        /** Set the output key and value types of the map and reduce functions. */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        /** Set the input and output paths. */
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        /** Submit the job and wait for it to complete. */
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
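To try it out, build the jar with `mvn clean package`, then submit it with something like `hadoop jar target/wordcount-1.0.jar WordCount input output`; the jar path and the two HDFS paths here are placeholders for your own. Note that the output directory must not already exist: FileOutputFormat refuses to overwrite an existing path and the job will fail at submission.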