注意:
- 1.Java语言编写,数据类型需要用Hadoop自带的数据类型!!
- 2.三个文件:map,reduce,main(driver)
需要的jar包
-
$HADOOP_HOME/share/hadoop/common/ $HADOOP_HOME/share/hadoop/common/lib $HADOOP_HOME/share/hadoop/mapreduce $HADOOP_HOME/share/hadoop/mapreduce/lib
WordCountMapper
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map stage of WordCount: consumes one line of input text per call and emits
 * a (word, 1) pair for every whitespace-separated token in that line.
 *
 * Input  key/value : byte offset of the line (LongWritable) / line contents (Text)
 * Output key/value : word (Text) / constant count of 1 (IntWritable)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    // Reusable Writable instances: one map task may process millions of
    // records, so avoid allocating fresh objects for every emitted pair.
    private final Text outWord = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // context connects this mapper upstream to HDFS input and
        // downstream to the shuffle/reduce stage.
        String data = value1.toString();
        // Split on runs of whitespace (spaces AND tabs). The original
        // split(" ") produced empty tokens for consecutive spaces, which
        // would have been counted as a bogus "" word.
        String[] words = data.split("\\s+");
        for (String word : words) {
            // A leading-whitespace line still yields one empty first token; skip it.
            if (!word.isEmpty()) {
                outWord.set(word);
                context.write(outWord, ONE);   // emit <k2, v2> = <word, 1>
            }
        }
    }
}
WordCountReducer
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reduce stage of WordCount: for each distinct word, sums all the partial
 * counts produced by the mappers and emits the word with its total.
 *
 * The shuffle guarantees every value for a given key arrives at the same
 * reducer, so summing the iterable gives the global count for that word.
 * Note <k2,v2> and <k3,v3> types must match the mapper's output types.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text k3, Iterable<IntWritable> v3,Context context) throws IOException, InterruptedException {
        // context connects this reducer upstream to the map/shuffle stage
        // and downstream to HDFS output.
        int total = 0;
        for (IntWritable count : v3) {
            total += count.get();
        }
        // Emit <k4, v4> = <word, total occurrences>.
        context.write(k3, new IntWritable(total));
    }
}
WordCountMain
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the WordCount job: wires the mapper and reducer together,
 * declares the intermediate and final key/value types, and submits the job.
 *
 * Usage: hadoop jar wordcount.jar WordCountMain &lt;hdfs input path&gt; &lt;hdfs output path&gt;
 */
public class WordCountMain {
    public static void main(String[] args) throws Exception {
        // Guard against missing CLI arguments instead of failing later
        // with an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountMain <input path> <output path>");
            System.exit(2);
        }
        // Create the job from a fresh (cluster-default) configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Entry-point class used to locate the jar on the cluster.
        // BUG FIX: was WordCountJob.class, a class that does not exist;
        // the driver class itself is WordCountMain.
        job.setJarByClass(WordCountMain.class);
        // Mapper and its output (intermediate) types <k2, v2>.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);        // k2
        job.setMapOutputValueClass(IntWritable.class); // v2
        // Reducer and the job's final output types <k4, v4>.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);           // k4
        job.setOutputValueClass(IntWritable.class);  // v4
        // HDFS input and output paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Block until the job finishes and report success/failure via exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}