【Hadoop】——MapReduce：序列化

MapReduce自定义序列化

原创已于 2022-01-20 22:28:27 修改 · 2.1k 阅读

10 ·

CC 4.0 BY-SA版权

文章标签：

#hadoop #mapreduce #大数据

于 2022-01-20 22:25:19 首次发布

bigdata 同时被 2 个专栏收录

39 篇文章

订阅专栏

hadoop

9 篇文章

订阅专栏

1. 什么是序列化

<1>序列化就是把内存中的对象，转换成字节序列（或其他数据传输协议）以便于存储到磁盘（持久化）和网络传输。
<2>反序列化就是将收到的字节序列（或其他数据传输协议）或者是磁盘的持久化数据，转换成内存中的对象。

2. 优势

<1>结构紧凑，存储空间少
<2>传输快速
<3>互操作性

3. 自定义bean对象序列化步骤

<1>实现Writable接口
在这里插入图片描述
<2>无参构造函数
反序列化时，需要反射调用空参数构造函数，所以必须有无参构造函数

<3>重写序列化方法
在这里插入图片描述

<4>重写反序列化方法
在这里插入图片描述

<5>反序列化的顺序和序列化的顺序要完全一致
<6>重写toString()
要想把结果显示在文件中，需要重写toString()，字段之间可以用"\t"分开，方便后续使用
在这里插入图片描述

<7>如果需要将自定义的bean放在key中传输，则还需要实现Comparable接口，因为MapReduce框架中的Shuffle过程要求对key必须能排序
在这里插入图片描述

4. 序列化实例

<1>需求分析

统计
每个手机号耗费的总上行流量，总下行流量，总流量
输入数据
phone_data.txt
输入数据格式
期望输出的数据格式

<2>过程分析

1. Map阶段

<1>读取一行数据，切分字段
<2>抽取手机号，上行流量，下行流量
<3>以手机号为key，bean对象为value输出，即context.write(手机号，bean);
<4> bean对象要想能够传输，必须实现序列化接口

2. Reduce阶段

<1>累加上行流量和下行流量得到总流量

<3>编写程序

1. FlowBean

package com.demo.mapreduce.writable;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Traffic counters (upstream, downstream, total) carried as a Hadoop {@link Writable}.
 *
 * <p>What this class provides:
 * <ol>
 *   <li>it implements the {@code Writable} interface;</li>
 *   <li>it overrides the serialization and deserialization methods, which handle the
 *       fields in the same order;</li>
 *   <li>it declares a public no-argument constructor;</li>
 *   <li>it overrides {@code toString()} to produce a tab-separated line.</li>
 * </ol>
 */
public class FlowBean implements Writable {
    /** Upstream traffic. */
    private long upFlow;
    /** Downstream traffic. */
    private long downFlow;
    /** Total traffic. */
    private long sumFlow;

    /** No-argument constructor. */
    public FlowBean() {
    }

    /**
     * Serializes the three counters as longs: upstream, downstream, total.
     *
     * @param dataOutput sink the fields are written to
     * @throws IOException if writing to {@code dataOutput} fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(this.upFlow);
        dataOutput.writeLong(this.downFlow);
        dataOutput.writeLong(this.sumFlow);
    }

    /**
     * Restores the three counters, reading them in the same order {@link #write} emits them.
     *
     * @param dataInput source the fields are read from
     * @throws IOException if reading from {@code dataInput} fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

    /**
     * Renders the bean as {@code upFlow<TAB>downFlow<TAB>sumFlow}.
     *
     * @return the tab-separated counters
     */
    @Override
    public String toString() {
        return new StringBuilder()
                .append(upFlow).append('\t')
                .append(downFlow).append('\t')
                .append(sumFlow)
                .toString();
    }

    // ------------ accessors ------------

    /** @return the upstream traffic */
    public long getUpFlow() {
        return this.upFlow;
    }

    /** @param upFlow the upstream traffic to store */
    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    /** @return the downstream traffic */
    public long getDownFlow() {
        return this.downFlow;
    }

    /** @param downFlow the downstream traffic to store */
    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    /** @return the total traffic */
    public long getSumFlow() {
        return this.sumFlow;
    }

    /** @param sumFlow the total traffic to store */
    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Recomputes the total as the current upstream plus downstream traffic. */
    public void setSumFlow() {
        sumFlow = upFlow + downFlow;
    }
}

2. FlowMapper

package com.demo.mapreduce.writable;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that turns each tab-separated input line into a (phone number, {@link FlowBean}) pair.
 */
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    // Output key and value objects; the same instances are refilled on every map() call.
    private final Text phoneKey = new Text();
    private final FlowBean flowValue = new FlowBean();

    /**
     * Parses one input line and emits the phone number with its traffic counters.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of input text
     * @param context sink for the emitted key/value pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on tab characters.
        String[] fields = value.toString().split("\t");
        int fieldCount = fields.length;

        // Phone number is the second field; the traffic columns are addressed from the
        // end of the line: third-from-last is upstream, second-from-last is downstream.
        String phone = fields[1];
        String upText = fields[fieldCount - 3];
        String downText = fields[fieldCount - 2];

        // Fill the reusable output objects.
        phoneKey.set(phone);
        flowValue.setUpFlow(Long.parseLong(upText));
        flowValue.setDownFlow(Long.parseLong(downText));
        flowValue.setSumFlow();

        // Emit the pair.
        context.write(phoneKey, flowValue);
    }
}

3. FlowReducer

package com.demo.mapreduce.writable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that adds up all {@link FlowBean} values sharing the same key and emits one total.
 */
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    // Output value object; refilled on every reduce() call.
    private final FlowBean aggregate = new FlowBean();

    /**
     * Sums the upstream, downstream and total counters of every value for {@code key}.
     *
     * @param key     the key the values were grouped under
     * @param values  all beans emitted for that key
     * @param context sink for the aggregated result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        long upSum = 0L;
        long downSum = 0L;
        long grandSum = 0L;

        // Accumulate each counter independently across all values.
        for (FlowBean flow : values) {
            upSum = upSum + flow.getUpFlow();
            downSum = downSum + flow.getDownFlow();
            grandSum = grandSum + flow.getSumFlow();
        }

        // Store the totals in the reusable output bean.
        aggregate.setUpFlow(upSum);
        aggregate.setDownFlow(downSum);
        aggregate.setSumFlow(grandSum);

        // Emit the key with its aggregated counters.
        context.write(key, aggregate);
    }
}

4. FlowDriver

package com.demo.mapreduce.writable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Entry point that configures and submits the flow-statistics MapReduce job.
 */
public class FlowDriver {
    /** Input file used when no paths are passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "E:\\javaworkspaces\\MapReduceDemo\\phone_data.txt";
    /** Output directory used when no paths are passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "E:\\javaworkspaces\\MapReduceDemo\\phone_data_result";

    /**
     * Builds the job, waits for it to finish and exits with 0 on success or 1 on failure.
     *
     * @param args optional {@code <input path> <output path>}; when fewer than two
     *             arguments are given the built-in default paths are used
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        String inputPath = args.length >= 2 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length >= 2 ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1. Obtain the job.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Set the jar by its driver class.
        job.setJarByClass(FlowDriver.class);

        // 3. Wire up the mapper and reducer.
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // 4. Declare the map output key and value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // 5. Declare the final (job) output key and value types.
        //    These are the setOutput* setters, distinct from the setMapOutput* ones in step 4.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // 6. Set the job's input path.
        FileInputFormat.setInputPaths(job, new Path(inputPath));

        // 7. Set the job's output directory (it must not already exist).
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 8. Submit the job and wait for it to complete.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

5. phone_data.txt

数据源

1	13736230513	192.196.100.1	www.atguigu.com	2481	24681	200
2	13846544121	192.196.100.2			264	0	200
3 	13956435636	192.196.100.3			132	1512	200
4 	13966251146	192.168.100.1			240	0	404
5 	18271575951	192.168.100.2	www.atguigu.com	1527	2106	200
6 	84188413	192.168.100.3	www.atguigu.com	4116	1432	200
7 	13590439668	192.168.100.4			1116	954	200
8 	15910133277	192.168.100.5	www.hao123.com	3156	2936	200
9 	13729199489	192.168.100.6			240	0	200
10 	13630577991	192.168.100.7	www.shouhu.com	6960	690	200
11 	15043685818	192.168.100.8	www.baidu.com	3659	3538	200
12 	15959002129	192.168.100.9	www.atguigu.com	1938	180	500
13 	13560439638	192.168.100.10			918	4938	200
14 	13470253144	192.168.100.11			180	180	200
15 	13682846555	192.168.100.12	www.qq.com	1938	2910	200
16 	13992314666	192.168.100.13	www.gaga.com	3008	3720	200
17 	13509468723	192.168.100.14	www.qinghua.com	7335	110349	404
18 	18390173782	192.168.100.15	www.sogou.com	9531	2412	200
19 	13975057813	192.168.100.16	www.baidu.com	11058	48243	200
20 	13768778790	192.168.100.17			120	120	200
21 	13568436656	192.168.100.18	www.alibaba.com	2481	24681	200
22 	13568436656	192.168.100.19			1116	954	200