MapReduce Combiner (规约)

Diagram

(figure: where the combiner fits into the MapReduce data flow)

The combiner's main purpose is to reduce the amount of data transferred over the network during the shuffle phase.
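For example, if a map task emits (hello, 1) three times, the combiner collapses those pairs into a single (hello, 3) before anything is sent to the reducers, so far fewer records cross the network.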

Preparation

Prepare a wordcount.txt file; ideally the data should contain plenty of repeated words so the effect of the combiner is easy to see.
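
For reference, a tiny sample of what wordcount.txt might contain (illustrative data only, comma-separated as the mapper below expects):

hello,world
hello,hadoop
hello,world
hadoop,world

Upload it to the HDFS input directory that JobMain reads from (hdfs://master:9000/wordcount as configured below), e.g.:

hdfs dfs -mkdir -p /wordcount
hdfs dfs -put wordcount.txt /wordcount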

Before the combiner:
(figure: job counters without the combiner)

After the combiner, the Reduce input records counter drops noticeably:
(figure: job counters with the combiner)

Java code

WordCountMapper.java


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


/*
Mapper generics:
        KEYIN:    type of k1 - the byte offset of the line, LongWritable
        VALUEIN:  type of v1 - one line of text, Text
        KEYOUT:   type of k2 - a single word, Text
        VALUEOUT: type of v2 - the fixed value 1, LongWritable
 */

public class WordCountMapper extends Mapper<LongWritable, Text,Text,LongWritable> {

    /*
    map() converts each (k1, v1) into (k2, v2)
    key:     k1
    value:   v1
    Context: the MapReduce context object
     */

    /*
        k1      v1
        0       hello,world
        11      hello,hadoop
---------------------------------------------------------
        k2      v2
        hello   1
        world   1
        hello   1
        hadoop  1
     */

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {


        Text text = new Text();
        // split the line of text into individual words
        String line = value.toString();
        String[] split = line.split(",");
        // iterate over the array and emit each word with a count of 1
        for (String word : split){
            text.set(word);
            context.write(text,new LongWritable(1));

        }
    }
}
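
A small optional refinement (a sketch, not part of the original code): because map() is called once per input line, the Text and LongWritable objects can be created once as fields and reused, avoiding a new allocation for every output record:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // reuse the same writable instances across map() calls
    private final Text word = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the line on commas and emit (word, 1) for each word
        for (String w : value.toString().split(",")) {
            word.set(w);
            context.write(word, one);
        }
    }
}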

WordCountReducer.java

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


/*
Reducer generics:
        KEYIN:    k2 - Text, a single word
        VALUEIN:  v2 - LongWritable, the element type of the value collection, e.g. <1,1>
        KEYOUT:   k3 - Text, a single word
        VALUEOUT: v3 - LongWritable, the number of times the word occurs
 */

public class WordCountReducer extends Reducer<Text, LongWritable,Text,LongWritable> {

    /*
    reduce() converts (k2, v2) into (k3, v3)
    key:     k2
    values:  the collection of v2 values for this key
    Context: the MapReduce context object
     */

    /*
        k2      v2
        hello   <1,1>
        world   <1,1,1>
        hadoop  <1,1>
-------------------------------------------
        k3      v3
        hello   2
        world   3
        hadoop  2
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {


        long count = 0;     // accumulates the sum of the values

        // 1. iterate over the values collection
        for (LongWritable value : values) {
            // 2. add each value to the running total
            count += value.get();
        }

        // 3. write (k3, v3) to the context
        context.write(key,new LongWritable(count));
        
    }
}

MyCombiner.java - this is essentially the reducer logic run early, inside each map task, to reduce the amount of data shuffled over the network.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyCombiner extends Reducer<Text, LongWritable,Text,LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {

        long count = 0;     // accumulates the sum of the values

        // 1. iterate over the values collection
        for (LongWritable value : values) {
            // 2. add each value to the running total
            count += value.get();
        }

        // 3. write (k3, v3) to the context
        context.write(key,new LongWritable(count));

    }
}
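
Since MyCombiner is exactly the same logic as WordCountReducer, an equally valid shortcut (not what this post does, just a common alternative) is to register the reducer class itself as the combiner in JobMain; this works here because summing counts is associative and commutative:

        // alternative in JobMain.run(): reuse the reducer as the combiner
        job.setCombinerClass(WordCountReducer.class);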

JobMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class JobMain extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {

        //create the job object; the job name ("wordcount") can be anything
        Job job = Job.getInstance(super.getConf(),"wordcount");     //super.getConf() returns the Configuration supplied via ToolRunner
        //needed so the cluster can find the job classes when the code is packaged as a jar
        job.setJarByClass(JobMain.class);
        //step 1: set the input format class that produces k1 and v1
        job.setInputFormatClass(TextInputFormat.class);     //how the input is read
        TextInputFormat.addInputPath(job,new Path("hdfs://master:9000/wordcount"));   //where the input is read from

        //step 2: set the mapper class
        job.setMapperClass(WordCountMapper.class);  //the mapper written above
        //set the map-phase output types, i.e. the types of k2 and v2
        job.setMapOutputKeyClass(Text.class);   //k2
        job.setMapOutputValueClass(LongWritable.class); //v2

        //steps 3-6 use the defaults (partition, sort, combine, group)
        //set our combiner class
        job.setCombinerClass(MyCombiner.class);


        //step 7: set the reducer class
        job.setReducerClass(WordCountReducer.class);    //the reducer written above
        //set the reduce-phase output types, i.e. the types of k3 and v3
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //set the number of reduce tasks
        job.setNumReduceTasks(2);

        //step 8: set the output format class
        job.setOutputFormatClass(TextOutputFormat.class);
        //set the output path
        TextOutputFormat.setOutputPath(job,new Path("hdfs://master:9000/out"));

        //return 0 on success, 1 otherwise
        boolean b = job.waitForCompletion(true);
        return b?0:1;

    }
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();      //load the Hadoop configuration
        int run = ToolRunner.run(configuration,new JobMain(),args);
        System.exit(run);
    }
}
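
A quick usage sketch (the jar name is an assumption; the input/output paths are the ones hard-coded in run(), and the /out directory must not already exist or the job will fail):

hadoop jar wordcount.jar JobMain
hdfs dfs -cat /out/part-r-*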
