MapReduce 新旧WordCount 代码解读

最新推荐文章于 2022-03-30 12:34:48 发布

礼彬fly

最新推荐文章于 2022-03-30 12:34:48 发布

阅读量830

点赞数

分类专栏： Bigdatda-Hadoop1.0 文章标签： MapReduce 新旧WordCount

Bigdatda-Hadoop1.0 专栏收录该内容

47 篇文章 0 订阅

订阅专栏

一、mapreduce 的 wordcount 旧API

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
/**
*
* 描述：WordCount explains by Felix
* @author Hadoop Dev Group
*/
public class WordCount
{
/**
* MapReduceBase类:实现了Mapper和Reducer接口的基类（其中的方法只是实现接口，而未作任何事情）
* Mapper接口：
* WritableComparable接口：实现WritableComparable的类可以相互比较。所有被用作key的类应该实现此接口。
* Reporter 则可用于报告整个应用的运行进度，本例中未使用。
*
*/
public static class Map extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable>
{
/**
* LongWritable, IntWritable, Text 均是 Hadoop 中实现的用于封装 Java 数据类型的类，这些类实现了WritableComparable接口，
* 都能够被串行化从而便于在分布式环境中进行数据交换，你可以将它们分别视为long,int,String 的替代品。
*/
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
* Mapper接口中的map方法：
* void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
* 映射一个单个的输入k/v对到一个中间的k/v对
* 输出对不需要和输入对是相同的类型，输入对可以映射到0个或多个输出对。
* OutputCollector接口：收集Mapper和Reducer输出的<k,v>对。
* OutputCollector接口的collect(k, v)方法:增加一个(k,v)对到output
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException
{
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens())
{
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException
{
int sum = 0;
while (values.hasNext())
{
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception
{
/**
* JobConf：map/reduce的job配置类，向hadoop框架描述map-reduce执行的工作
* 构造方法：JobConf()、JobConf(Class exampleClass)、JobConf(Configuration conf)等
*/
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount"); //设置一个用户定义的job名称
conf.setOutputKeyClass(Text.class); //为job的输出数据设置Key类
conf.setOutputValueClass(IntWritable.class); //为job输出设置value类
conf.setMapperClass(Map.class); //为job设置Mapper类
conf.setCombinerClass(Reduce.class); //为job设置Combiner类
conf.setReducerClass(Reduce.class); //为job设置Reduce类
conf.setInputFormat(TextInputFormat.class); //为map-reduce任务设置InputFormat实现类
conf.setOutputFormat(TextOutputFormat.class); //为map-reduce任务设置OutputFormat实现类
/**
* InputFormat描述map-reduce中对job的输入定义
* setInputPaths():为map-reduce job设置路径数组作为输入列表
* setInputPath()：为map-reduce job设置路径数组作为输出列表
*/
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf); //运行一个job
}
}
二、mapreduce 的 wordcount 新API
1. import java.io.IOException;
2. import java.util.StringTokenizer;
4. import org.apache.hadoop.conf.Configuration;
5. import org.apache.hadoop.fs.Path;
6. import org.apache.hadoop.io.IntWritable;
7. import org.apache.hadoop.io.Text;
8. import org.apache.hadoop.mapreduce.Job;
9. import org.apache.hadoop.mapreduce.Mapper;
10. import org.apache.hadoop.mapreduce.Reducer;
11. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13. import org.apache.hadoop.util.GenericOptionsParser;
15. public class WordCount {
17. /**
18. * TokenizerMapper 继续自 Mapper<Object, Text, Text, IntWritable>
19. *
20. * [一个文件就一个map,两个文件就会有两个map]
21. * map[这里读入输入文件内容以" \t\n\r\f" 进行分割，然后设置 word ==> one 的key/value对]
22. *
23. * @param Object Input key Type:
24. * @param Text Input value Type:
25. * @param Text Output key Type:
26. * @param IntWritable Output value Type:
27. *
28. * Writable的主要特点是它使得Hadoop框架知道对一个Writable类型的对象怎样进行serialize以及deserialize.
29. * WritableComparable在Writable的基础上增加了compareT接口，使得Hadoop框架知道怎样对WritableComparable类型的对象进行排序。
30. *
31. * @author yangchunlong.tw
32. *
33. */
34. public static class TokenizerMapper
35. extends Mapper<Object, Text, Text, IntWritable>{
37. private final static IntWritable one = new IntWritable(1);
38. private Text word = new Text();
39. public void map(Object key, Text value, Context context
40. ) throws IOException, InterruptedException {
41. StringTokenizer itr = new StringTokenizer(value.toString());
42. while (itr.hasMoreTokens()) {
43. word.set(itr.nextToken());
44. context.write(word, one);
45. }
46. }
47. }
49. /**
50. * IntSumReducer 继承自 Reducer<Text,IntWritable,Text,IntWritable>
51. *
52. * [不管几个Map,都只有一个Reduce,这是一个汇总]
53. * reduce[循环所有的map值,把word ==> one 的key/value对进行汇总]
54. *
55. * 这里的key为Mapper设置的word[每一个key/value都会有一次reduce]
56. *
57. * 当循环结束后，最后的确context就是最后的结果.
58. *
59. * @author yangchunlong.tw
60. *
61. */
62. public static class IntSumReducer
63. extends Reducer<Text,IntWritable,Text,IntWritable> {
64. private IntWritable result = new IntWritable();
66. public void reduce(Text key, Iterable<IntWritable> values,
67. Context context
68. ) throws IOException, InterruptedException {
69. int sum = 0;
70. for (IntWritable val : values) {
71. sum += val.get();
72. }
73. result.set(sum);
74. context.write(key, result);
75. }
76. }
78. public static void main(String[] args) throws Exception {
79. Configuration conf = new Configuration();
80. String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
81. /**
82. * 这里必须有输入/输出
83. */
84. if (otherArgs.length != 2) {
85. System.err.println("Usage: wordcount <in> <out>");
86. System.exit(2);
87. }
88. Job job = new Job(conf, "word count");
89. job.setJarByClass(WordCount.class);//主类
90. job.setMapperClass(TokenizerMapper.class);//mapper
91. job.setCombinerClass(IntSumReducer.class);//作业合成类
92. job.setReducerClass(IntSumReducer.class);//reducer
93. job.setOutputKeyClass(Text.class);//设置作业输出数据的关键类
94. job.setOutputValueClass(IntWritable.class);//设置作业输出值类
95. FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//文件输入
96. FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//文件输出
97. System.exit(job.waitForCompletion(true) ? 0 : 1);//等待完成退出.
98. }
99. }

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。