使用Hadoop的mapReduce实现计算单词数

最新推荐文章于 2022-05-08 20:05:37 发布

binlixia

最新推荐文章于 2022-05-08 20:05:37 发布

阅读量661

点赞数 1

分类专栏：分布式系统

本文链接：https://blog.csdn.net/binlixia/article/details/47383133

版权

分布式系统专栏收录该内容

13 篇文章 0 订阅

订阅专栏

注意：

每一次map方法的调用处理输入文件中的一行文本；

只有当所有的map都执行完时，才会执行reduce

本次实现的是计算单词的数量：在map阶段，输入key是该行文本在文件中的偏移量（LongWritable类型），输入value是该行文本的内容（Text类型），输出key是单词（Text类型），输出value是该单词出现的次数（LongWritable类型）；

reduce阶段输出的key类型是Text类型，输出的value是LongWritable

1、要使用map-reduce的框架必须导入以下的包

2、继承Mapper类，重写map方法

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper of the word-count job: splits every input line into words and emits
 * one {@code (word, 1)} pair per word occurrence.
 *
 * <p>Hadoop's own serializable types ({@link LongWritable}, {@link Text}) are
 * used for keys and values. The JDK's built-in serialization could be used as
 * well, but it is less efficient.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** Constant count written for every single word occurrence. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Output key, reused across writes to avoid one allocation per word. */
    private final Text word = new Text();

    /**
     * Tokenizes one line of text and records each word once.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of input text
     * @param context sink that receives the {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on runs of any whitespace so that tabs and repeated spaces
        // do not produce bogus tokens.
        String[] tokens = value.toString().split("\\s+");
        for (String token : tokens) {
            // A blank line or leading whitespace still yields an empty token;
            // it is not a word and must not be counted.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE); // one occurrence of this word
        }
    }

}

3、继承Reducer类，重写reduce方法

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer of the word-count job: adds up all counts received for one word and
 * emits a single {@code (word, total)} pair.
 */
public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Sums the counts belonging to {@code key} and writes the total.
     *
     * @param key     the word being counted
     * @param values  the counts recorded for this word
     * @param context sink that receives the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0L;
        for (LongWritable count : values) {
            // LongWritable wraps a long; get() unwraps the primitive value.
            total = total + count.get();
        }

        LongWritable result = new LongWritable(total);
        context.write(key, result);
    }

}

4、提交mapReduce任务

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the word-count MapReduce job built from
 * {@link WCMapper} and {@link WCReducer}.
 */
public class WordCount {

    /** Input path used when none is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/words";

    /** Output path used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/wcount0809";

    /**
     * Submits the job, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path and
     *             {@code args[1]} the output path; each falls back to its
     *             default when absent
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        Job job = Job.getInstance(new Configuration()); // obtain a job object
        // Important: names the class whose containing jar is used for the job.
        job.setJarByClass(WordCount.class);

        job.setMapperClass(WCMapper.class);             // the Mapper implementation
        job.setMapOutputKeyClass(Text.class);           // key type emitted by map
        job.setMapOutputValueClass(LongWritable.class); // value type emitted by map

        FileInputFormat.setInputPaths(job, new Path(inputPath)); // where the input is read from

        job.setReducerClass(WCReducer.class);           // the Reducer implementation
        job.setOutputKeyClass(Text.class);              // key type emitted by reduce
        job.setOutputValueClass(LongWritable.class);    // value type emitted by reduce

        FileOutputFormat.setOutputPath(job, new Path(outputPath)); // where the result is written

        // true = print progress while the job runs. The boolean result says
        // whether the job succeeded; propagate it as the process exit status
        // instead of silently discarding it.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }

}