Complete Code
package edu.qfnu.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.*;

public class SoGou {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(SoGou.class);
        job.setMapperClass(SoGou.SGMapper.class);
        job.setReducerClass(SoGou.SGReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path("/input3"));
        FileOutputFormat.setOutputPath(job, new Path("/output3"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class SGMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private static final LongWritable one = new LongWritable(1);
        private Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split("\t"); // split the line on tabs
            String keys = line[2];                        // the third field is the query keyword
            text.set(keys);
            context.write(text, one);                     // emit <keyword, 1>
        }
    }

    public static class SGReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        HashMap<String, Integer> map = new HashMap<>();   // keyword -> total count, written out in cleanup()

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0; // number of times this key occurs
            for (LongWritable Itext : values) {
                sum += Itext.get();
            }
            map.put(key.toString(), sum); // only store the total here; the actual output happens in cleanup()
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            List<Map.Entry<String, Integer>> list = new LinkedList<Map.Entry<String, Integer>>(map.entrySet());
            // sort the entries by value (the count) in descending order
            Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o2.getValue() - o1.getValue();
                }
            });
            for (Map.Entry<String, Integer> e : list) {
                context.write(new Text(e.getKey()), new LongWritable(e.getValue()));
            }
        }
    }
}
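After packaging the program into a jar (the name sogou.jar below is just a placeholder), it can be submitted with hadoop jar sogou.jar edu.qfnu.hadoop.SoGou. The input and output paths are hardcoded to /input3 and /output3 on HDFS, and /output3 must not already exist when the job starts.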
Analysis
map() runs once per input line, while reduce() runs once per key group, so with map and reduce alone you cannot control how many times output gets written. Both the Mapper and the Reducer also have setup() and cleanup() methods, and each of those runs exactly once. So in the reduce phase we can use each word (here, each query keyword) as a key and its number of occurrences as a value, and store every group into a map, only storing and writing nothing out yet. Then, in the reducer's cleanup(), we sort the map and output the top three (the cleanup() in the code above writes all sorted entries; a sketch for limiting it to three follows the cleanup() walkthrough below).
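Why it is safe to defer all output to cleanup() can be seen from the reducer's run loop. The following is a simplified sketch of what Hadoop's Reducer.run() does (not the literal source, but the call order it guarantees):

// Simplified sketch: setup() once, reduce() once per key group, cleanup() once at the end
public void run(Context context) throws IOException, InterruptedException {
    setup(context);                                                     // runs once before the first group
    while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context); // runs once per key group
    }
    cleanup(context);                                                   // runs once after the last group
}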
1. Configuration. Before running a MapReduce program, a Configuration object has to be created. Its job is to read the MapReduce system configuration (both the HDFS side and the MapReduce side), i.e. the configuration files of the Hadoop installation such as core-site.xml, hdfs-site.xml and mapred-site.xml:
Configuration conf = new Configuration();
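As a quick sanity check that these files are actually being read, you can print a value that normally comes from core-site.xml, for example fs.defaultFS (a minimal sketch; what gets printed depends on the cluster's configuration):

Configuration conf = new Configuration();
// fs.defaultFS comes from core-site.xml and names the default file system, e.g. hdfs://namenode:9000
System.out.println(conf.get("fs.defaultFS"));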
2. Job. Instantiating a Job object builds a task. Writing a MapReduce program means implementing three parts: the Map class, the Reduce class, and the driver code that configures how MapReduce runs them:
Job job = Job.getInstance(conf, "word count"); // conf holds the Job's configuration; "word count" is the Job's name
job.setJarByClass(SoGou.class);                // the jar that contains the program, located via the class SoGou
job.setMapperClass(SGMapper.class);            // the class implementing the map function
job.setCombinerClass(SGReducer.class);         // optional; tied to how MapReduce runs internally and can be omitted, but it can make the job more efficient
job.setReducerClass(SGReducer.class);          // the class implementing the reduce function
// the next two lines declare the key/value types of the final result files written to HDFS
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
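Note on setCombinerClass: reusing the reducer as a combiner only works when the reduce logic is commutative and associative and writes its result directly from reduce(), as in a plain word count. The SGReducer in this program defers all of its output to cleanup(), so the complete code at the top does not set a combiner.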
3. Text: the Text type stores its length as a variable-length int and encodes its contents as UTF-8, whereas a Java String is stored as Unicode (UTF-16).
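A small illustration of the difference (a sketch; the byte count assumes the UTF-8 encoding that Text always uses):

Text t = new Text("搜索");          // Text stores the UTF-8 bytes
System.out.println(t.getLength());  // 6: each of the two Chinese characters takes 3 bytes in UTF-8
String s = "搜索";
System.out.println(s.length());     // 2: String counts UTF-16 code units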
4. LongWritable: the LongWritable key passed to map is the byte offset of the line, i.e. the line's position within the file, not its line number.
5. Context: the context object; map and reduce use it to write their output, and it gives access to the job's configuration.
6. tree_map.keySet(): returns the set of a TreeMap's keys in ascending order. A TreeMap is backed by a binary (red-black) search tree, which keeps its keys unique and stored in sorted order. The Map interface's get method:
Object get(Object key): returns the value to which the specified key is mapped, or null if the map contains no mapping for that key.
7. entrySet(): returns a Set view of the mappings contained in the map (one mapping is one key/value pair), i.e. each key-value pair is stored in the Set as a single entry.
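A minimal sketch of these three APIs together (assuming java.util.TreeMap and java.util.Map are imported), using the same counts as the word-count example below:

TreeMap<String, Integer> treeMap = new TreeMap<>();
treeMap.put("Hadoop", 7);
treeMap.put("Bye", 4);
treeMap.put("Hello", 2);
System.out.println(treeMap.keySet());      // [Bye, Hadoop, Hello] -- keys come back in ascending order
System.out.println(treeMap.get("Hadoop")); // 7
System.out.println(treeMap.get("Spark"));  // null -- no mapping for this key
for (Map.Entry<String, Integer> e : treeMap.entrySet()) {
    System.out.println(e.getKey() + " = " + e.getValue()); // each entry is one key/value pair
}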
The Map Function
map(LongWritable key, Text value, Context context) takes three parameters: key and value are the input key and value, and context is where the output is recorded; a pair is emitted by calling context.write(key, value).
Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> is a generic class, so only the four types have to be supplied. The key entering map is the position of the line (its byte offset) and the value is the content of that line; the key leaving map is each word (here, the query keyword) and the value is the count 1, and on the reduce side those counts are gathered into a per-key list. map() itself is just the method that fixes the parameter types and receives the parameters.
public static class SGMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private static final LongWritable one = new LongWritable(1); // every word produced by the split gets a value of 1
    private Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] line = value.toString().split("\t"); // split each line on the tab character "\t"
        String keys = line[2];                        // take the third field of the line (the query keyword)
        text.set(keys);                               // put keys into the reusable Text object
        context.write(text, one);                     // write the key/value pair to the context
    }
}
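To make the indexing concrete: the log is tab-separated and the query keyword is the third field (index 2). The line below is a made-up example with that layout (time, user ID, query, rank, click order, URL), not a real record from the data set:

String sample = "00:00:01\t12345678\thadoop安装\t1\t1\thttp://example.com/page"; // hypothetical log line
String[] fields = sample.split("\t");
System.out.println(fields[2]); // prints the query keyword: hadoop安装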
The Reduce Function
The reduce function's input is also in key/value form, but its value is an iterable, Iterable<LongWritable> (or Iterable<IntWritable>, as long as it matches the value type the map function emits).
public static class SGReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    HashMap<String, Integer> map = new HashMap<>(); // a HashMap to hold the totals; HashMap offers the fastest random access among the Java collections

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0; // number of times this key occurs
        for (LongWritable Itext : values) { // merge the map output, producing the count for this word
            sum += Itext.get();
        }
        map.put(key.toString(), sum); // only store the total here; the output is written in cleanup()
    }
The data reduce receives (i.e. the per-key groups assembled from the map output) looks like <Bye,1,1,1,1> <Hadoop,1,1,1,1,1,1,1> <Hello,1,1>. reduce then loops over each group of values and adds them up, counting the total number of occurrences of each word, so the final result is <Bye,4> <Hadoop,7> <Hello,2>.
The Reduce function's cleanup() method
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        List<Map.Entry<String, Integer>> list = new LinkedList<Map.Entry<String, Integer>>(map.entrySet());
        // use the Collections utility with a custom comparator to sort the entries by value,
        // i.e. by word count, in descending order
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue() - o1.getValue();
            }
        });
        for (Map.Entry<String, Integer> e : list) {
            context.write(new Text(e.getKey()), new LongWritable(e.getValue())); // write each word and its count to the output file
        }
    }
}
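The analysis section above talks about writing out only the top three queries, while the cleanup() shown here writes every sorted entry. If only the top three are wanted, a small change to the output loop is enough (a sketch):

int rank = 0;
for (Map.Entry<String, Integer> e : list) {
    if (++rank > 3) break; // stop after the three most frequent queries
    context.write(new Text(e.getKey()), new LongWritable(e.getValue()));
}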
Data
Link: https://pan.baidu.com/s/1iEjTtCOEP4pPjYzlA6LoKw
Extraction code: uhlo
A good article on MapReduce:
https://www.cnblogs.com/sharpxiajun/p/3151395.html