Day 12: MapReduce

  1. Shuffle: partitioning, sorting, combiner, grouping

I. MapReduce

  1. Source analysis: the split loop in FileInputFormat.getSplits()
for (bytesRemaining = length; (double)bytesRemaining / (double)splitSize > 1.1D; bytesRemaining -= splitSize) {
    blkIndex = this.getBlockIndex(blkLocations, length - bytesRemaining);
    splits.add(this.makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
}
if (bytesRemaining != 0L) { // the remainder (at most 1.1 * splitSize) becomes the final split
    blkIndex = this.getBlockIndex(blkLocations, length - bytesRemaining);
    splits.add(this.makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
}

Interview point: while computing splits, MapReduce only carves off a new split (and hence a new map task) while the remaining bytes divided by the split size exceed 1.1; the final remainder, up to 1.1 × splitSize, becomes a single split.
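A quick worked example of the 1.1 rule, assuming the default 128 MB split size: a 260 MB file gives 260 / 128 ≈ 2.03 > 1.1, so a 128 MB split is carved off; the remaining 132 MB gives 132 / 128 ≈ 1.03 ≤ 1.1, so the loop stops and all 132 MB become the second split. The file therefore yields 2 map tasks rather than 3.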

II. Partitioning exercise (for when the partition count is known in advance; when it is not, set the number of reduce tasks to an upper bound: total data volume / the maximum amount of data one reducer should handle)

1. pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hlzq</groupId>
    <artifactId>day10</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.7.25</version>
        </dependency>
    </dependencies>

</project>

Why too few partitions causes an error: if the partitioner returns a partition index greater than or equal to the number of reduce tasks, the map task fails with an "Illegal partition" IOException. (The reverse is harmless: reduce tasks that receive no partition simply write empty output files.)

package com.hlzq;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class Jobmain {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        // 1. create the job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "mypartioner_covid19");
        // 2. set the jar that contains this job
        job.setJarByClass(Jobmain.class);
        // 3. set the input format and the input path
        job.setInputFormatClass(TextInputFormat.class); // read line by line
        TextInputFormat.addInputPath(job, new Path("file:///G:\\input\\us-covid19-counties.dat")); // source file
        // 4. set the custom mapper class and the k2/v2 types
        job.setMapperClass(MyPartiMapper.class);
        job.setMapOutputKeyClass(Text.class);           // k2
        job.setMapOutputValueClass(NullWritable.class); // v2
        // 5. set the custom partitioner (if any)
        job.setPartitionerClass(MyParti.class);
        // the number of reduce tasks defaults to 1; here, one per expected partition
        job.setNumReduceTasks(55);
        // 6. set the custom grouping class (if any) -- none here
        // 7. set the custom reducer class and the k3/v3 types
        job.setReducerClass(MyPartiReducer.class);
        job.setOutputKeyClass(Text.class);           // k3
        job.setOutputValueClass(NullWritable.class); // v3

        // delete the output directory if it already exists
        Path outputPath = new Path("file:///G:\\output");
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        // 8. set the output format and the output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);

        // 9. submit the job to the cluster
        boolean bl = job.waitForCompletion(true); // true: print progress while running

        // 10. exit with the job status
        System.exit(bl ? 0 : 1);
    }
}
package com.hlzq;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// k1: byte offset of the line, v1: the line of text
// k2: the whole line unchanged, v2: NullWritable
public class MyPartiMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}
package com.hlzq;
// Custom partition rule: extend Partitioner and override getPartition.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.HashMap;

public class MyParti extends Partitioner<Text, NullWritable> {
    private static int index = -1;
    // key: state name, value: the partition number assigned to that state
    private static HashMap<String, Integer> map = new HashMap<String, Integer>();

    // returns the partition number for a state name, assigning a fresh
    // number the first time the state is seen (map keys are unique);
    // note: each map task JVM keeps its own copy of this static map
    public static int getPar(String stateName) {
        boolean b = map.containsKey(stateName);
        if (!b) {
            map.put(stateName, ++index);
            return index;
        } else {
            return map.get(stateName);
        }
    }

    // k2: text, v2: nullWritable, i: the reduce task count set in Jobmain
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int i) {
        // split the line of text and pull out the state field
        String[] split = text.toString().split(",");
        return getPar(split[2]);
    }
}
package com.hlzq;


import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyPartiReducer extends Reducer <Text, NullWritable,Text,NullWritable>{
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        context.write(key,NullWritable.get());
    }
}
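Assuming the input really covers all 55 US states and territories, this job writes 55 files, part-r-00000 through part-r-00054, each holding the rows of a single state; any reduce task whose partition number is never handed out by getPar just leaves an empty file.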

III. The default partitioner

  1. Partitions are chosen from the key's hash value.
  2. If a large number of keys are identical (data skew), salt them, e.g. by appending a timestamp (milliseconds since 1970-01-01).
  3. The hash code is bitwise-ANDed with Integer.MAX_VALUE (2147483647) so the partition number can never be negative. A sketch of the default partitioner follows below.
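For reference, Hadoop's default HashPartitioner boils down to a one-liner; this local sketch mirrors the class shipped in hadoop-mapreduce-client-core:

package com.hlzq;

import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    // masking the hash with Integer.MAX_VALUE clears the sign bit,
    // so the modulo always lands in [0, numReduceTasks)
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}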

IV. Sorting and serialization

  1. Serialization: converting a structured object into a byte stream.
In plain Java this is done by implementing the Serializable interface; in Hadoop, classes implement the Writable interface instead (as the Student class below does).

package com.hlzq;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// A JavaBean used in MapReduce must be serializable, i.e. implement Writable

public class Student implements Writable {
    private  String  name;
    private  int age;

    public Student() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }
    // serialization: write the fields
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(age);
    }
    // deserialization: read the fields; the read order must match the write order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.age = in.readInt();
    }
}
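A minimal round-trip sketch (the class name StudentRoundTrip is illustrative, not from the original notes) showing that readFields recovers exactly what write emitted:

package com.hlzq;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class StudentRoundTrip {
    public static void main(String[] args) throws IOException {
        Student s = new Student();
        s.setName("tom");
        s.setAge(18);

        // serialize: Writable.write() turns the object into bytes
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        s.write(new DataOutputStream(bytes));

        // deserialize: readFields() rebuilds the object from the same bytes
        Student copy = new Student();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.getName() + " " + copy.getAge()); // tom 18
    }
}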
  2. Sorting: specify the sort rule explicitly; without one, objects have nothing to be ordered by (only their identity hash, the "address value").
package com.hlzq;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/** A JavaBean used in MapReduce must be serializable.
 * If the JavaBean is to serve as k2 with a custom sort rule, it must implement
 * WritableComparable, which covers both serialization and comparison:
 * public interface WritableComparable<T> extends Writable, Comparable<T> {}
 */
public class Student2 implements WritableComparable<Student2> {
    private String name;
    private int age;

    public Student2() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    // serialization: write the fields
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(age);
    }

    // deserialization: read order must match write order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.age = in.readInt();
    }

    // comparator: return >0, <0, or 0; Integer.compare avoids the
    // overflow that plain subtraction can cause with extreme values
    @Override
    public int compareTo(Student2 o) {
        return Integer.compare(this.getAge(), o.getAge());
    }
}
  3. Example. Input data:

  a   1
  a   9
  b   3
  a   7
  b   8
  b   10
  a   5

Expected output:

  a   1
  a   5
  a   7
  a   9
  b   3
  b   8
  b   10

Requirements:
Sort the first column in dictionary order.
When the first column is equal, sort the second column in ascending order.

package com.hlzq.sort;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SortBean implements WritableComparable<SortBean> {
    private  String word;
    private int num;

    @Override
    public String toString() {
        return word + "\t" + num;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    // MapReduce sorts with quicksort on the map side and merge sort when
    // merging spill files; this method supplies the ordering it uses
    @Override
    public int compareTo(SortBean o) {
        int i = this.word.compareTo(o.getWord());
        if (i == 0) {
            // same word: fall back to num, ascending
            return Integer.compare(this.getNum(), o.getNum());
        }
        return i;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(num);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.word = in.readUTF();
        this.num = in.readInt();
    }
}
package com.hlzq.sort;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SortBeanMapper extends Mapper<LongWritable, Text,SortBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. split the v1 line of text on tabs
        String[] split = value.toString().split("\t");
        // 2. wrap the two fields in a SortBean to build k2
        SortBean sortBean = new SortBean();
        sortBean.setWord(split[0]);
        sortBean.setNum(Integer.parseInt(split[1]));
        // 3. v2 is NullWritable
        // 4. write k2 and v2 to the context
        context.write(sortBean, NullWritable.get());
    }
}
package com.hlzq.sort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SortBeanReducer extends Reducer<SortBean, NullWritable,SortBean,NullWritable> {
    @Override
    protected void reduce(SortBean key, Iterable<NullWritable> values, Reducer<SortBean, NullWritable, SortBean, NullWritable>.Context context) throws IOException, InterruptedException {
        context.write(key,NullWritable.get());
    }
}
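Note that because v2 is NullWritable, two completely identical input lines become equal SortBean keys, fall into one reduce group, and are written out only once; if duplicates must survive, carry the count (or the raw line) as v2 instead.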
package com.hlzq.sort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class Jobmain {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        // 1. create the job
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "mysort");
        // 2. set the jar that contains this job
        job.setJarByClass(Jobmain.class);
        // 3. set the input format and input path
        job.setInputFormatClass(TextInputFormat.class); // read line by line
        TextInputFormat.addInputPath(job, new Path("file:///G:\\input")); // input directory
        // 4. set the custom mapper class and the k2/v2 types
        job.setMapperClass(SortBeanMapper.class);
        job.setMapOutputKeyClass(SortBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 5. set the reducer and the k3/v3 types
        job.setReducerClass(SortBeanReducer.class);
        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. delete the output directory if it already exists
        Path outputPath = new Path("file:///G:\\output");
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        // 7. set the output format and the output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);

        // 8. submit the job to the cluster
        boolean bl = job.waitForCompletion(true); // true: print progress while running

        // 9. exit with the job status
        System.exit(bl ? 0 : 1);
    }
}

V. Combiner (local reduction)

  1. Every map task can produce a large amount of local output. A combiner performs a first merge of that output on the map side, cutting the volume of data shipped between map and reduce nodes and improving IO performance; it is one of MapReduce's optimization techniques.
  2. Code
package com.hlzq.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyCombiner extends Reducer<Text, LongWritable,Text,LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        long count=0L;
        for (LongWritable value:values){
            count+=value.get();
        }
        // write the partially aggregated key/count back to the context
        context.write(key,new LongWritable(count));

    }
}

Register the combiner class in Jobmain:

job.setCombinerClass(MyCombiner.class);

If the combiner logic is identical to the reducer's, you need not write a separate combiner class; register the reducer itself. (This only works when the combiner's input and output types match the map output types and the operation is safe to apply repeatedly, as a sum is.)

job.setCombinerClass(MyReduce.class);


VI. Custom grouping

  1. Grouping: by default, values whose k2 keys are equal are gathered into one collection for a single reduce() call.
  2. Example: given the following order data
    OrderId       ProductId  Amount
    Order_0000001 Pdt_01     222.8
    Order_0000001 Pdt_05     25.8
    Order_0000002 Pdt_03     522.8
    Order_0000002 Pdt_04     122.4
    Order_0000002 Pdt_05     722.4
    Order_0000003 Pdt_01     2221.8

The task: for each order, find the single transaction with the largest amount (a Top-N problem); a sketch of the grouping-comparator approach follows below.
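The notes stop at the problem statement, so here is a minimal sketch of the grouping half of the solution, under stated assumptions: an OrderBean (orderId, amount) serves as k2, its compareTo sorts by orderId and then by amount descending, the partitioner sends equal orderIds to the same reducer (or a single reduce task is used), and all class and field names are illustrative rather than from the original notes.

package com.hlzq.group;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// k2 for the per-order Top-1 job: ordered by orderId, then amount descending,
// so the first key of each order group carries its largest transaction
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private double amount;

    public String getOrderId() { return orderId; }
    public void setOrderId(String orderId) { this.orderId = orderId; }
    public double getAmount() { return amount; }
    public void setAmount(double amount) { this.amount = amount; }

    @Override
    public int compareTo(OrderBean o) {
        int i = this.orderId.compareTo(o.orderId);
        if (i == 0) {
            return Double.compare(o.amount, this.amount); // descending amount
        }
        return i;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.amount = in.readDouble();
    }

    @Override
    public String toString() {
        return orderId + "\t" + amount;
    }
}
package com.hlzq.group;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Grouping rule: two OrderBean keys join the same reduce group whenever
// their orderId fields match, no matter how their amounts differ
public class OrderGroupComparator extends WritableComparator {
    public OrderGroupComparator() {
        super(OrderBean.class, true); // true: instantiate keys for comparison
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((OrderBean) a).getOrderId().compareTo(((OrderBean) b).getOrderId());
    }
}

Registered in the driver with job.setGroupingComparatorClass(OrderGroupComparator.class), the reducer then only has to write each group's first key: context.write(key, NullWritable.get()) emits one line per order, and thanks to the secondary sort that line is the order's most expensive transaction.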
