The core idea of MapReduce
Split a large dataset into many small blocks and process those blocks in parallel. The processing is divided into two phases: a Map phase and a Reduce phase.
Map phase
The input is divided into key-value pairs; each pair is processed by a user-defined Map function, which emits intermediate results.
Reduce phase
The intermediate results are grouped by key, and all values sharing the same key are passed to a user-defined Reduce function, which produces the final output.
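A minimal word-count sketch makes the two phases concrete (the class and variable names here are illustrative, not from the original example):

```java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Map phase: split each input line into words and emit (word, 1).
class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split("\\s+")) {
            word.set(token);
            context.write(word, ONE);
        }
    }
}

// Reduce phase: all counts for the same word arrive together; sum them.
class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
```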
Basic data types
| Hadoop type | Java type |
| --- | --- |
| Text | String |
| IntWritable | int |
| LongWritable | long |
| ByteWritable | byte |
| BytesWritable | byte[] |
| DoubleWritable | double |
| FloatWritable | float |
| NullWritable | null |
| ShortWritable | short |
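Each Writable wraps its Java counterpart and is constructed and unwrapped explicitly; a quick sketch:

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        // Wrap plain Java values in their Hadoop counterparts.
        Text name = new Text("u_001");
        IntWritable age = new IntWritable(18);

        // Unwrap them back to plain Java types.
        String nameValue = name.toString();
        int ageValue = age.get();

        // NullWritable is a singleton placeholder for "no value".
        NullWritable nothing = NullWritable.get();

        System.out.println(nameValue + ", " + ageValue + ", " + nothing);
    }
}
```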
Custom data types
A custom type implements the Writable interface and overrides its write() and readFields() methods.
Note: write() serializes the object when the map side emits it, and readFields() deserializes it when the reduce side reads it back, so the fields must be read in exactly the same order they were written.
```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class CombineBean implements Writable {
    private String orderId;
    private String orderCreateDate;
    private String userId;
    private String sex;
    private int age;
    private String phone;
    private int type; // 1 marks a user record, 0 an order record

    // Serialize: called when the bean is written out.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.orderCreateDate);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.sex);
        dataOutput.writeInt(this.age);
        dataOutput.writeUTF(this.phone);
        dataOutput.writeInt(this.type);
    }

    // Deserialize: fields must be read in the same order they were written.
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.orderCreateDate = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.sex = dataInput.readUTF();
        this.age = dataInput.readInt();
        this.phone = dataInput.readUTF();
        this.type = dataInput.readInt();
    }
}
```
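A mismatch between write() and readFields() silently corrupts every field after the first misread, so a cheap sanity check is to round-trip the bean through an in-memory byte stream, which mirrors what Hadoop does with the serialized bytes. A minimal generic helper sketch (WritableRoundTrip is illustrative, not part of the original code):

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class WritableRoundTrip {
    // Serialize a Writable to bytes and deserialize into a fresh instance,
    // just as Hadoop does between emitting and reading a record. If the
    // read order diverges from the write order, the returned bean's fields
    // will not match the original.
    public static <T extends Writable> T roundTrip(T in, T out) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        return out;
    }
}
```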
MapReduce example: map-side join
This example does all of its work in the mapper (a map-side join); no reducer is used.
Mapper
```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapJoinMapper extends Mapper<LongWritable, Text, CombineBean, CombineBean> {

    private Map<String, CombineBean> userCache = null;
    private String fileName;

    // Load the small dataset (user.data) into an in-memory map before any
    // map() calls: step 1 -> FileSystem, step 2 -> Path.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the split being processed and the name of its file.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        Path path = inputSplit.getPath();
        fileName = path.getName();

        // Only mappers reading an order file need the user cache.
        if (!fileName.contains("order")) {
            return;
        }

        // Load user.data into the cache, keyed by userId.
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path userInputPath = new Path("D:/ideaProjects/hadoop_pro/mapJoin/input/user.data");
        FSDataInputStream in = fs.open(userInputPath);
        List<String> lines = IOUtils.readLines(in, StandardCharsets.UTF_8);
        in.close();

        userCache = new HashMap<String, CombineBean>();
        for (String line : lines) {
            String[] split = line.split(",");
            // user.data: userId,sex,age,phone -> type 1 marks a user record
            userCache.put(split[0],
                    new CombineBean(split[0], split[1], Integer.parseInt(split[2]), split[3], 1));
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        if (fileName.contains("order")) {
            String[] split = value.toString().split(",");
            String userId = split[2];
            // order.data: orderId,orderCreateDate,userId -> type 0 marks an order record
            CombineBean order = new CombineBean(split[0], split[1], split[2], 0);
            // Join: emit (user, order) when the order's userId hits the cache.
            if (userCache.containsKey(userId)) {
                context.write(userCache.get(userId), order);
            }
        }
    }

    @Override // nothing to release here; kept for symmetry with setup()
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
    }
}
```
Launch
```java
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

public class MapJoinLaunch {
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Initialize log4j console logging.
        BasicConfigurator.configure();

        // Create and name the job, and set the jar entry class.
        Job job = Job.getInstance();
        job.setJobName("mapJoin");
        job.setJarByClass(MapJoinLaunch.class);

        // Map-only job: configure the mapper and its output types. The join
        // emits CombineBean keys and values, so both output classes must be
        // CombineBean; no reducer is configured.
        job.setMapperClass(MapJoinMapper.class);
        job.setOutputKeyClass(CombineBean.class);
        job.setOutputValueClass(CombineBean.class);

        // Delete the output directory if it already exists.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        Path out = new Path("D:/ideaProjects/hadoop_pro/mapJoin/output");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        // Set the input and output paths.
        FileInputFormat.addInputPath(job, new Path("D:/ideaProjects/hadoop_pro/mapJoin/input"));
        FileOutputFormat.setOutputPath(job, out);

        // No reduce phase: map output is written directly.
        job.setNumReduceTasks(0);

        // Submit the job and wait for it to finish.
        job.waitForCompletion(true);
    }
}
```
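One design note: the hard-coded local path in setup() only works when the job runs in local mode. On a real cluster, the usual approach is to ship the small file with the job via the distributed cache. A sketch of the two fragments involved (the HDFS path is hypothetical and error handling is elided; this is not part of the original example):

```java
// In MapJoinLaunch.main(), after creating the job: ship the small file
// with the job (hypothetical HDFS path; new URI(...) can throw
// URISyntaxException, which main would need to declare or handle).
job.addCacheFile(new URI("hdfs:///data/mapJoin/user.data"));

// In MapJoinMapper.setup(): read the cached copy instead of a fixed local path.
URI[] cacheFiles = context.getCacheFiles();
FileSystem fs = FileSystem.get(context.getConfiguration());
FSDataInputStream in = fs.open(new Path(cacheFiles[0].getPath()));
```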
Input
```
order.data
o_001,2023-11-11,u_001
o_002,2023-11-11,u_002

user.data
u_001,女,18,18629xxxxxx
u_002,男,18,18629370xxx
```
Output
```
CombineBean{orderId='', orderCreateDate='', userId='u_001', sex='女', age=18, phone='18629xxxxxx', type=1}	CombineBean{orderId='o_001', orderCreateDate='2023-11-11', userId='u_001', sex='', age=0, phone='', type=0}
CombineBean{orderId='', orderCreateDate='', userId='u_002', sex='男', age=18, phone='18629370xxx', type=1}	CombineBean{orderId='o_002', orderCreateDate='2023-11-11', userId='u_002', sex='', age=0, phone='', type=0}
```