Hadoop Map/Reduce是一个使用简易的软件框架,基于它写出来的应用程序能够运行在由上千个商用机器组成的大型集群上,并以一种可靠容错的方式并行处理TB级别的数据集。下面是示例程序 WordCount.java:
package org.myorg;//包含在包myorg中
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {

    /**
     * Mapper: splits each input line into whitespace-delimited tokens
     * and emits a (token, 1) pair for every token found.
     */
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused output objects — avoids allocating per emitted token.
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        /**
         * Tokenizes one input line and emits each token with a count of 1.
         *
         * @param key      byte offset of the line in the input (unused)
         * @param value    the line of text to tokenize
         * @param output   collector receiving (word, 1) pairs
         * @param reporter progress reporter (unused)
         */
        public void map(LongWritable key, Text value,
                        OutputCollector<Text, IntWritable> output,
                        Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    /**
     * Reducer: sums all counts emitted for a given word.
     * Also used as a combiner for local pre-aggregation on the map side.
     */
    public static class Reduce extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Adds up every partial count for {@code key} and emits the total.
         *
         * @param key      the word
         * @param values   partial counts for this word
         * @param output   collector receiving (word, total) pairs
         * @param reporter progress reporter (unused)
         */
        public void reduce(Text key, Iterator<IntWritable> values,
                           OutputCollector<Text, IntWritable> output,
                           Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    /**
     * Configures and submits the word-count job.
     *
     * @param args args[0] = input path(s), args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        // Types of the final output (key, value) records.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        // The reducer doubles as a combiner to cut map->reduce traffic.
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Submit the job and block, monitoring it until completion.
        JobClient.runJob(conf);
    }
}
package org.myorg;
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount extends Configured implements Tool {//定义单词统计WordCount类
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text,
Text,IntWritable>{//定义映射map类
static enum Counters{ INPUT_WORDS }
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();//定义对象word用来包含需要统计的字符串
//初始化每个统计变量
private boolean caseSensitive = true;
private Set<string> patternsToSkip = new HashSet<String>();
private long numRecords = 0;
private String inputFile;
public void configure(JobConf job) {//修改配置参数
caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
inputFile = job.get("map.input.file");
if (job.getBoolean("wordcount.skip.patterns", false)){
path[] patternsFiles = new Path[0];
try{
patternsFiles = DistributedCache.getLocalCacheFiles(job);//使用DistributedCache 来分发只读数据。 这里允许用户指定单词的模式,在计数时忽略那些符合模式的单词
} catch (IOException ioe){
System.err.println("Caught exception while getting cached files: " +
StringUtils.stringifyException(ioe));
}
for (Path patternsFile : patternsFiles){
parseSkipFile(patternsFile);
}
}
}
private void parseSkipFile(Path patternsFile) {//从文件中读入字符串
try {
BufferedReader fis = new BufferedRdader(new FileReader(patternsFile.toString()));//读入数据
String pattern = null;
while ((pattern = fis.readLine()) != null){
patternsToSkip.add(pattern);//将读入数据保存到patternsToSkip
}
}catch (IOException ioe){
System.err.println("Caught exception while parsing the cached file '" +
patternsFile + "' :" + StringUtils.stringifyException(ioe));
}
}
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable>
output, Reporter reporter) throws IOException {
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();//一行一行的读入传入的字符串
for (String pattern : patternsToSkip) {
line = line.replaceAll(pattern, "");
}
StringTokenizer tokenizer = new StringTokenizer(line);//以空格分隔符将一行分为若干tokens
while (tokenizer.hasMoreTokens()) {//统计相同单词出现的次数
word.set(tokenizer.nextToken());
output.collect(word, one);//统计相同单词出现的次数
reporter.incrCounter(Counters.INPUT_WORDS, 1);//遇到相同的进行+1操作
}
if((++numRecords % 100) == 0) {//打印最后的统计结果
reporter.setStatus("Finished processing "+ numRecords + " records" + "from the input file: "+ inputFile);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable,
Text, IntWritable> {//定义映射之间总数的统计
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {定义具体的reduce方法,将每次映射中对应每个单词出现的次数进行统计求和,输入参数有key,values,输出参数有output
int sum = 0;//初始化总数
while (values.hasNext()) {//将两次映射中对应的次数进行求和
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));//输出output得到最终结果
}
}
public int run(String[] args) throws Exception {//执行统计方法run
JobConf conf = new JobConf(getConf(), WordCount.class);//定义JobCon的配置变量conf,代表一个Map/Reduce作业的配置。
conf.setJobName("wordcount");
conf.setOutputKeyClass(Text.class);//框架随后会把与一个特定key关联的所有中间过程的值(value)分成组,然后把它们传给Reducer以产出最终的结果。用户可以通过 JobConf.setOutputKeyComparatorClass(Class)来指定具体负责分组的 Comparator。
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);;//Mapper已经排好序的输出
conf.setCombinerClass(Reduce.class);//用户可选择通过 JobConf.setCombinerClass(Class)指定一个combiner,它负责对中间过程的输出进行本地的聚集,这会有助于降低从Mapper到 Reducer数据传输量。
conf.setReducerClass(Reduce.class);
conf.setIntputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutpotFormat.class);
List<String> other_args = new ArrayList<String>();
for (int i=0; i < args.length; ++i) {
if ("-skip".equals(args[i])) {
DistributedCache.addCacheFile(new path(args[++i]).toUri(), conf);//使用DistributedCache 来分发只读数据。 这里允许用户指定单词的模式,在计数时忽略那些符合模式的单词
conf.setBoolean("wordcount.skip.patterns", true);
} else{
other_args.add(args[i]);
}
}
FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));//定义文件的输入格式,指定一组输入文件
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));//定义文件的输出格式,输出文件应该写在哪儿
JobClient.runJob(conf);//提交作业并且监控它的执行
return 0;
}
public static void main(String[] args) throws Exception{
int res = ToolRunner.run(new Configuration(), new WordCount(), args);//执行wordcount统计
System.exit(res);
}
}