Just as Hello World is the traditional first program in most languages, word count is the traditional first program for MapReduce. The main code is as follows:
package Temperature;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
/***
 *
 * When a job is submitted to MapReduce, the input file is first divided into
 * splits. Since this is only a test, there is just one split. MapReduce then
 * breaks the file up line by line into <key, value> pairs, much like a Python
 * dictionary.
 *
 */
/***
 *
 * The <key, value> pairs produced above are passed to our custom map below,
 * which emits new <key, value> pairs.
 * The file data above was split by line; here each line is further split on
 * whitespace.
 *
 */
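// For example, given the input line "hello world hello", the map below emits
// <hello, 1>, <world, 1>, <hello, 1>; the framework then groups these pairs
// by key before they reach the reducer.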
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
* hadoop === java
* BooleanWritable === boolean
* ByteWritable === byte
* ShortWritable === short
* LongWritable === long
* Text === String
* IntWritable === int
* FloatWritable === float
* DoubleWritable === double
* ArrayWritable === Array
* MapWritable === map
*/
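// For example, these wrappers round-trip plain Java values:
//   new IntWritable(5).get() returns the int 5
//   new Text("word").toString() returns the String "word"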
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
/**
 *
 * After the <key, value> pairs are produced, the map side sorts them by key.
 * If a Combine function is defined, entries with the same key are merged at
 * this point (combiners are covered later, so they are not explained here).
 * The sorted <key, value> pairs are then handed to the Reducer.
 * The reduce side first sorts the data it receives, grouping it into
 * <key, [values]>, then passes each group to our custom reduce function
 * below, which finally emits <key, value> pairs to HDFS.
 *
 */
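// Continuing the word-count example: the reducer receives <hello, [1, 1]> and
// <world, [1]>, sums each value list, and emits <hello, 2> and <world, 1>.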
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
Reporter report) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
// Configure the output key and value types
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
// Configure the Map and Reduce classes
conf.setMapperClass(Map.class);
conf.setReducerClass(Reduce.class);
// Configure the input and output formats
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
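// Note: TextInputFormat presents each line as <byte offset, line contents>,
// i.e. <LongWritable, Text>, which matches the input types of Map above.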
// Set the input and output paths
FileInputFormat.setInputPaths(conf, new Path("hdfs://192.168.1.51:9000/input/qixiang_data"));
FileOutputFormat.setOutputPath(conf, new Path("hdfs://192.168.1.51:9000/output/lzh/3"));
// Submit the job
JobClient.runJob(conf);
}
}
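One extension hinted at by the Combine comment above: because this job's reduce is a plain integer sum, which is associative and commutative, the same Reduce class could also serve as the combiner and pre-aggregate counts on the map side. A minimal sketch, assuming the same JobConf as above:

// Hypothetical addition: reuse Reduce as a combiner to cut shuffle traffic.
conf.setCombinerClass(Reduce.class);

To run the job, package the class into a jar (the name below is illustrative) and submit it with: hadoop jar wordcount.jar Temperature.WordCount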