MapReduce学习之MapJoin案例实现-CSDN博客

本文链接：https://blog.csdn.net/m0_58050808/article/details/139335208

MapReduce学习之MapJoin案例实现

1.当前main方法所在的入口类

package com.shujia.mr.mapJoin;

import com.shujia.mr.reduceJoin.ReduceJoin;
import com.shujia.mr.reduceJoin.ReduceJoinMapper;
import com.shujia.mr.reduceJoin.ReduceJoinReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Driver (entry point) for the map-side join job.
 *
 * <p>Goal: join the student base-information records with the score records on the map side.
 *
 * <p>Why a cache file is used: the map function is called once per input line and consecutive
 * lines are unrelated, so two different files cannot be joined by simply reading both as job
 * input. Instead:
 * <ol>
 *   <li>the score data is registered as a cache file here and loaded into a HashMap by the
 *       mapper's setup step, before any student line is processed;</li>
 *   <li>each line of students.txt is then matched against that HashMap;</li>
 *   <li>the joined line is written out.</li>
 * </ol>
 */
public class MapJoin {
    /**
     * Configures and submits the job, then waits for it to finish.
     *
     * @param args command-line arguments (not used)
     * @throws IOException            if a file-system check or the job submission fails; a
     *                                {@link FileNotFoundException} is thrown when the student
     *                                input or the cached score file does not exist
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     * @throws ClassNotFoundException if a class required by the job cannot be found
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MapJoin");
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(MapJoinMapper.class);
        // NOTE(review): no reducer class is registered and the reduce-task count is left at its
        // default. If a pure map-only job is wanted, consider job.setNumReduceTasks(0) -- this
        // would change the output file names, so it is deliberately not done here.

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Step 4: configure the input and output paths (relative paths).
        FileSystem fileSystem = FileSystem.get(job.getConfiguration());
        Path outPath = new Path("hadoop/out/mapJoin");
        Path studentInpath = new Path("hadoop/data/students.txt");
        Path scoreCachePath = new Path("hadoop/out/count/part-r-00000");

        // Fail fast when the score file is missing; otherwise the problem only shows up later,
        // inside the setup step of every map task.
        if (!fileSystem.exists(scoreCachePath)) {
            throw new FileNotFoundException(scoreCachePath+"不存在");
        }
        // Register the path so the mapper can obtain it through context.getCacheFiles().
        job.addCacheFile(scoreCachePath.toUri());

        if (!fileSystem.exists(studentInpath)) {
            throw new FileNotFoundException(studentInpath+"不存在");
        }
        TextInputFormat.addInputPath(job,studentInpath);

        // Remove a previous output directory so the job can be re-run.
        if (fileSystem.exists(outPath)) {
            System.out.println("路径存在，开始删除");
            fileSystem.delete(outPath,true);
        }
        TextOutputFormat.setOutputPath(job,outPath);

        // Step 5: submit the job and wait. A failed job must not look like a success to the
        // calling shell or scheduler, so exit with a non-zero status in that case.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            System.exit(1);
        }
    }
}

2.Map端

package com.shujia.mr.mapJoin;

import jdk.nashorn.internal.runtime.regexp.joni.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

/**
 * Mapper of the map-side join: loads the score data from the job's cache files into memory
 * during {@link #setup}, then appends the matching score to every student line in {@link #map}.
 *
 * <p>Precondition of a map-side join: every map task keeps its own in-memory copy of the small
 * table, so the small table must comfortably fit in memory (the original author suggests
 * roughly 25-40 MB at most), while the large table is streamed line by line.
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    /** Lookup table filled in setup: first tab-separated column -> second column as integer. */
    HashMap<String, Integer> scoreHashMap;

    /** Output key reused for every record to avoid one allocation per input line. */
    private final Text outKey = new Text();

    public MapJoinMapper() {
        this.scoreHashMap = new HashMap<>();
    }

    /**
     * Loads every cache file registered on the job into {@link #scoreHashMap}. Each line is
     * expected to be {@code key<TAB>integer}; lines with fewer than two columns are skipped.
     *
     * @param context task context providing the configuration and the cache-file URIs
     * @throws IOException          if no cache file is registered or a cache file cannot be read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        Configuration configuration = context.getConfiguration();
        URI[] files = context.getCacheFiles();
        if (files == null) {
            // Without the score data every join would miss; fail with a clear message
            // instead of a bare NullPointerException.
            throw new IOException("No cache file registered on the job; score data cannot be loaded");
        }

        for (URI filePath : files) {
            Path cachePath = new Path(filePath);
            // Resolve the file system from the path itself so a URI with an explicit scheme
            // is opened on its own file system rather than on the default one.
            FileSystem fileSystem = cachePath.getFileSystem(configuration);
            // try-with-resources closes the reader and, through it, the underlying stream.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(fileSystem.open(cachePath), java.nio.charset.StandardCharsets.UTF_8))) {
                String oneScore;
                while ((oneScore = bufferedReader.readLine()) != null) {
                    String[] column = oneScore.split("\t");
                    if (column.length < 2) {
                        // Blank or incomplete line: nothing to join on.
                        continue;
                    }
                    scoreHashMap.put(column[0], Integer.valueOf(column[1]));
                }
            }
        }
        System.out.println("Score数据加载完成，已存储到HashMap中");
    }

    /**
     * Appends the looked-up score to one student line and writes it as the output key.
     * Lines that do not split into exactly five comma-separated columns are dropped.
     * When no score exists for the id, the literal text {@code null} is appended (unchanged
     * from the previous behavior) and the {@code MapJoin/STUDENTS_WITHOUT_SCORE} counter is
     * incremented so that unmatched records are visible in the job counters.
     *
     * @param key     the input key supplied by the input format (not used)
     * @param value   one student line, e.g. {@code 1500100004,葛德曜,24,男,理科三班}
     * @param context task context used to emit the joined line
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String oneStuInfo = value.toString();

        String[] columns = oneStuInfo.split(",");
        if (columns.length != 5) {
            return;
        }

        Integer score = scoreHashMap.get(columns[0]);
        if (score == null) {
            context.getCounter("MapJoin", "STUDENTS_WITHOUT_SCORE").increment(1);
        }
        outKey.set(oneStuInfo + "," + score);
        context.write(outKey, NullWritable.get());
    }
}