Problem: word count. Given a file, count how many times each word appears.
Technique: MapReduce (map phase: split the input file into words; reduce phase: count each word).
Detailed walkthrough:
map: input: <line offset (key), line content (value)>
     LongWritable, Text
     output: <word, 1>
     Text, LongWritable(1)
package myMapreduceTest;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Grab the whole line
        String str = value.toString();
        // Split it into words (assumes words are separated by single spaces)
        String[] words = str.split(" ");
        // Emit <word, 1> for every word in the line
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
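To make the map step concrete, here is a minimal standalone sketch (not part of the job; the class name DemoMapOutput is made up for illustration) that applies the same split-and-emit logic to one sample line:

// Hypothetical standalone sketch: shows what WCMapper emits for one line.
public class DemoMapOutput {
    public static void main(String[] args) {
        String line = "hello world hello";
        for (String word : line.split(" ")) {
            System.out.println("<" + word + ", 1>");
        }
        // Output: <hello, 1>  <world, 1>  <hello, 1>
    }
}

The shuffle then groups these pairs by key, so the reducer receives <hello, [1, 1]> and <world, [1]>.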
reduce: input: <word, [1,1,1,1]>
        Text, Iterable<LongWritable>
        { compute sum (two ways: ① with an explicit iterator ② with a foreach loop) }
        output: <word, sum>
package myMapreduceTest;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override // marker annotation: this method overrides the superclass method
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Iterator version:
        // get an iterator over the grouped values (the <> holds the generic type)
        Iterator<LongWritable> iter = values.iterator();
        long sum = 0;
        // pull each value out of the iterator and accumulate it
        while (iter.hasNext()) {
            sum += iter.next().get();
        }
        context.write(key, new LongWritable(sum));
        // foreach version:
        /*for (LongWritable value : values) {
            sum += value.get();
        }*/
    }
}
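Before reduce runs, the framework sorts and groups the map output by key. The following standalone sketch (the class name DemoShuffleAndReduce is made up for illustration) simulates that grouping plus the same per-word summing in plain Java, without the Hadoop Writable types:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Hypothetical standalone sketch: simulates the shuffle's group-by-key step
// and WCReducer's summing loop using plain Java types instead of Writables.
public class DemoShuffleAndReduce {
    public static void main(String[] args) {
        // Map output for the line "hello world hello": three <word, 1> pairs
        List<String> mapOutputKeys = Arrays.asList("hello", "world", "hello");
        // Shuffle: group the 1s by key (TreeMap also sorts keys, as the shuffle does)
        Map<String, Long> counts = new TreeMap<>();
        for (String key : mapOutputKeys) {
            counts.merge(key, 1L, Long::sum); // reduce: accumulate the 1s per word
        }
        counts.forEach((word, sum) -> System.out.println("<" + word + ", " + sum + ">"));
        // Output: <hello, 2>  <world, 1>
    }
}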
Create the main method and run the job:
package myMapreduceTest;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCRunner {
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Usage: WCRunner <input path> <output path>");
            System.exit(-1);
        }
        Configuration conf = new Configuration();
        conf.setBoolean("mapreduce.map.output.compress", true);
        // Compress the map output. Beginners can skip this line for now and focus on
        // the MapReduce flow itself; compression is covered in a later post.
        //conf.setClass("mapreduce.map.output.compress.codec", GzipCodec.class, CompressionCodec.class);
        try {
            Job job = Job.getInstance(conf);
            // Job.getInstance(conf) creates a new Job instance configured with conf
            // Specify the runner, mapper, and reducer classes
            job.setJarByClass(WCRunner.class);
            job.setMapperClass(WCMapper.class);
            job.setReducerClass(WCReducer.class);
            // Set the key/value output types of the map phase
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // Use two reduce tasks, so the output is split across two part files
            job.setNumReduceTasks(2);
            // Set the key/value output types of the reduce phase
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit the job and block until it finishes
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
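To run the job, package the three classes into a jar and submit it with hadoop jar. The jar name and HDFS paths below are placeholders, substitute your own:

hadoop jar wordcount.jar myMapreduceTest.WCRunner /input/words.txt /output/wc

Since job.setNumReduceTasks(2) requests two reducers, the counts are written as two files (part-r-00000 and part-r-00001) under the output directory. Note that the output directory must not already exist, otherwise FileOutputFormat fails the job.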