MapReduce
- Create a map class that extends Mapper and override the map() method
- Create a reduce class that extends Reducer and override the reduce() method
- Create a main class that extends Configured and implements Tool; override the run() method and add a main() method
WordCount Example
- Map class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
Four generic types:
KEYIN:    type of k1
VALUEIN:  type of v1
KEYOUT:   type of k2
VALUEOUT: type of v2
*/
public class wordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // map converts k1,v1 into k2,v2
    /*
    key:     byte offset of the line
    value:   the text of one line
    context: the context object
    */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // split the line of text into words
        String[] split = value.toString().split(",");
        // iterate over the words, assembling k2 and v2
        for (String word : split) {
            text.set(word);
            longWritable.set(1);
            // write k2 and v2 to the context
            // context.write(new Text(word), new LongWritable(1));
            context.write(text, longWritable);
        }
    }
}
```
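A quick way to sanity-check the map logic without a Hadoop cluster is to replay the split-and-emit loop on a made-up line (the sample input below is hypothetical):

```java
// Minimal local sketch of the mapper's tokenize-and-emit logic; no Hadoop required.
public class MapperLogicDemo {
    public static void main(String[] args) {
        String line = "hello,world,hello"; // hypothetical input line (k1 would be its byte offset)
        for (String word : line.split(",")) {
            // each iteration corresponds to one context.write(text, longWritable) call
            System.out.println("(" + word + ", 1)");
        }
    }
}
```

Output: (hello, 1), (world, 1), (hello, 1); these are the (k2, v2) pairs handed to the shuffle.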
- Reduce class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
Four generic types:
KEYIN:    type of k2
VALUEIN:  type of v2
KEYOUT:   type of k3
VALUEOUT: type of v3
*/
public class wordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    // converts k2,v2 into k3,v3 and writes k3,v3 to the context
    /*
    Parameters:
    key:     k2
    values:  the grouped v2 values
    context: the context object
    */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // iterate over the grouped values and add them up
        for (LongWritable value : values) {
            count += value.get();
        }
        context.write(key, new LongWritable(count));
    }
}
```
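Between map and reduce, the shuffle groups the v2 values by key, so the reducer for "hello" sees an iterable like [1, 1]. A minimal local sketch of the summation (the sample values are hypothetical):

```java
import java.util.Arrays;
import java.util.List;

// Local sketch of the reduce step: sum the grouped counts for one key.
public class ReduceLogicDemo {
    public static void main(String[] args) {
        List<Long> values = Arrays.asList(1L, 1L); // hypothetical grouped v2 values for key "hello"
        long count = 0;
        for (long value : values) {
            count += value; // same accumulation as wordCountReduce
        }
        System.out.println("(hello, " + count + ")"); // -> (hello, 2), i.e. (k3, v3)
    }
}
```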
- Main class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class jobmain extends Configured implements Tool {
    // this method describes one job
    @Override
    public int run(String[] strings) throws Exception {
        // create a job object
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // ensures Hadoop can locate the jar containing this class at runtime
        job.setJarByClass(jobmain.class);

        // configure the job
        // 1. specify the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\input"));
        // file:/// is local mode
        // hdfs://  is cluster mode

        // 2. specify the map phase
        job.setMapperClass(wordCountMapper.class);
        // set the type of k2
        job.setMapOutputKeyClass(Text.class);
        // set the type of v2
        job.setMapOutputValueClass(LongWritable.class);

        // 3, 4, 5, 6: shuffle, using the defaults

        // 7. reduce phase: specify the reduce class and its data types
        job.setReducerClass(wordCountReduce.class);
        // set the type of k3
        job.setOutputKeyClass(Text.class);
        // set the type of v3
        job.setOutputValueClass(LongWritable.class);

        // 8. set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // set the output path
        Path path = new Path("file:///D:\\output");
        TextOutputFormat.setOutputPath(job, path);

        // get a FileSystem
        // FileSystem fileSystem = FileSystem.get(new URI("file:///D:\\output"), new Configuration());
        // // check whether the output directory exists
        // boolean bol = fileSystem.exists(path);
        // if (bol) {
        //     // delete the directory
        //     fileSystem.delete(path, true);
        // }

        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration entries = new Configuration();
        // launch the job
        int run = ToolRunner.run(entries, new jobmain(), args);
        System.exit(run);
    }
}
```
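The commented-out FileSystem block matters in practice: the job fails if the output directory already exists. A standalone sketch of that cleanup step, assuming the same local-mode output path as above (written with forward slashes to keep the URI valid):

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Deletes the job's output directory if present, so repeated runs don't fail.
public class OutputCleanup {
    public static void main(String[] args) throws Exception {
        Path path = new Path("file:///D:/output"); // mirrors the job's output path
        FileSystem fileSystem = FileSystem.get(new URI("file:///D:/output"), new Configuration());
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true = delete recursively
        }
    }
}
```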
Partitioning
- Create a partition class that extends Partitioner, with two generic types (the K2 and V2 types), and override the getPartition() method; a generic skeleton follows this list
- The getPartition() method defines the partitioning rule and returns the corresponding partition number (int)
- In the main class, specify the partitioner class in step 3 of the job configuration:
  `job.setPartitionerClass(partition.class);`
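As referenced in the list above, the generic shape of a custom partitioner might look like this (class name and key/value types are illustrative; the concrete example follows below):

```java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative skeleton: the two generics are the K2 and V2 types.
public class MyPartitioner extends Partitioner<Text, NullWritable> {
    @Override
    public int getPartition(Text key, NullWritable value, int numReduceTasks) {
        // inspect the key and return a partition number in [0, numReduceTasks)
        return 0;
    }
}
```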
Partitioning Example
Data format:
(The original screenshot of the data format could not be recovered. Judging from the partitioner below, each record is a tab-separated line whose field at index 5 holds a number.)
- Mapper class
  The entire line is used as K2. There is no meaningful value, but something must be emitted, so NullWritable.get() serves as a placeholder for V2. Whichever field you partition on must be contained in K2.

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class p_mapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // the whole line becomes k2; NullWritable.get() is the v2 placeholder
        context.write(value, NullWritable.get());
    }
}
```
- Partitioner class
  Override the getPartition() method; its parameters are K2 and V2.
  - Define the partitioning rule
  - Return the corresponding partition number

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/*
Two generic types:
KEY:   type of K2
VALUE: type of V2
Method to override: getPartition
*/
public class p_partitioner extends Partitioner<Text, NullWritable> {
    /*
    1. Define the partitioning rule
    2. Return the corresponding partition number
    Rows whose numeric field is greater than 15 go to one partition; the rest go to the other.
    From the map phase, text is one full line of input.
    */
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int numReduceTasks) {
        // 1. split the line on "\t" and take the field at index 5
        String num = text.toString().split("\t")[5];
        // 2. compare it with 15 and return the partition number
        if (Integer.parseInt(num) > 15) {
            return 1;
        } else {
            return 0;
        }
    }
}
```
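To see the rule in action, here is a local sketch that replays the getPartition logic on two hypothetical tab-separated rows:

```java
// Replays the partitioning rule on made-up rows; the field at index 5 is the 6th column.
public class PartitionRuleDemo {
    public static void main(String[] args) {
        String[] rows = {
            "a\tb\tc\td\te\t22\tf", // index 5 is "22" -> greater than 15 -> partition 1
            "a\tb\tc\td\te\t7\tf"   // index 5 is "7"  -> not greater     -> partition 0
        };
        for (String row : rows) {
            String num = row.split("\t")[5];
            int partition = Integer.parseInt(num) > 15 ? 1 : 0;
            System.out.println(num + " -> partition " + partition);
        }
    }
}
```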
- Reduce class
  No processing is needed; the data is simply passed through.

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class p_reducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // pass-through: write each key straight to the output
        context.write(key, NullWritable.get());
    }
}
```
- Main class
  Extends Configured and implements Tool; override the run() method and add a main() method. In main(), launch the job with ToolRunner.run(). The run() method has 3 steps:
  - Create the job object
  - Configure the job (8 steps):
    1. Set the input class and input path
    2. Set the Mapper class and its data types (K2, V2)
    3. Partitioning (set the number of ReduceTasks)
    4. Sorting
    5. Combining
    6. Grouping
    7. Set the Reduce class and its data types (K3, V3)
    8. Set the output class and output path
  - Wait for the job to finish
Code
```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class p_job extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        // 1. create a job object
        /*
        Two parameters:
        1. a Configuration object, obtained from the parent class (the same job must use the same Configuration)
        2. the job name, chosen freely
        */
        Job job = Job.getInstance(super.getConf(), "partitions");

        // 2. configure the job (eight steps)
        // 1. set the input class and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://<host>:<port>/<input dir>"));
        // 2. set the Mapper class and its data types (K2, V2)
        job.setMapperClass(p_mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 3. partitioning
        job.setPartitionerClass(p_partitioner.class);
        // 4. sorting  5. combining  6. grouping (defaults)
        // 7. set the Reduce class and its data types (K3, V3)
        job.setReducerClass(p_reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // set the number of ReduceTasks (defaults to 1 if not set);
        // choose the count to match the number of partitions required
        job.setNumReduceTasks(2);
        // 8. set the output class and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://<host>:<port>/<output dir>"));

        // 3. wait for the job to finish
        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // this Configuration is stored in the Configured parent class
        Configuration entries = new Configuration();
        // launch the job
        /*
        Three parameters:
        1. the Configuration object
        2. a Tool implementation; a new instance of this main class
        3. the arguments; just pass args
        */
        int run = ToolRunner.run(entries, new p_job(), args);
        // the return value is the job's exit status
        System.exit(run);
    }
}
```
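With setNumReduceTasks(2), each partition is handled by its own ReduceTask, so the output directory contains one file per partition: part-r-00000 for partition 0 (field values of 15 or less) and part-r-00001 for partition 1 (field values greater than 15).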