package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Demo05SumScore {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Input format: studentId,subjectId,score, e.g. 1500100001,1000001,98
            String[] splits = value.toString().split(",");
            String id = splits[0];
            int score = Integer.parseInt(splits[2]);
            // Emit the student id as the key and the score as the value
            context.write(new Text(id), new IntWritable(score));
        }
    }
    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo05SumScore");
        // Set the class the job runs
        job.setJarByClass(Demo05SumScore.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Configure the reduce side
        // Specify the Reducer class
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        Path path = new Path("/data/sumScore/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
}
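// A minimal local sketch of the parse-and-sum logic above, runnable without a
// cluster; the sample records are assumptions following the
// "studentId,subjectId,score" format shown in the mapper comment. Note the
// job's output uses TextOutputFormat's default tab separator, which is why
// Demo06MySort below splits its input on "\t".
class Demo05ParseSketch {
    public static void main(String[] args) {
        String[] lines = {"1500100001,1000001,98", "1500100001,1000002,87"};
        int sum = 0; // mirrors the reducer's running total for one student
        for (String line : lines) {
            String[] splits = line.split(",");
            sum += Integer.parseInt(splits[2]); // splits[0] is the student id
        }
        System.out.println("1500100001\t" + sum); // same layout as the job output
    }
}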
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Demo05SumScore {
    // Map side (unchanged; this version of the job adds a Combiner below)
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Input format: studentId,subjectId,score, e.g. 1500100001,1000001,98
            String[] splits = value.toString().split(",");
            String id = splits[0];
            int score = Integer.parseInt(splits[2]);
            // Emit the student id as the key and the score as the value
            context.write(new Text(id), new IntWritable(score));
        }
    }
    // Combiner: a reduce that runs on the map side
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // key: student id
            // values: the student's scores across the six subjects
            int sum = 0; // running total
            for (IntWritable score : values) {
                sum += score.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo05SumScore");
        // Set the class the job runs
        job.setJarByClass(Demo05SumScore.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Configure the Combiner
        job.setCombinerClass(MyCombiner.class);
        // Configure the reduce side
        // Specify the Reducer class
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        Path path = new Path("/data/sumScore/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
}
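// A local sketch of why a sum combiner is safe: summation is commutative and
// associative, so combining partial sums per map-side spill and then reducing
// over those partials gives the same total as reducing the raw values. The
// spill contents below are illustrative assumptions. Since MyCombiner is
// byte-for-byte identical to MyReducer, the driver could equivalently call
// job.setCombinerClass(MyReducer.class); either way, a combiner's input and
// output types must match the map output types (Text, IntWritable here).
class CombinerSketch {
    public static void main(String[] args) {
        int[] spill1 = {98, 87, 76};
        int[] spill2 = {65, 54, 43};
        int partial1 = 0, partial2 = 0, direct = 0;
        for (int v : spill1) { partial1 += v; direct += v; }
        for (int v : spill2) { partial2 += v; direct += v; }
        int reduced = partial1 + partial2; // reduce over combiner outputs
        System.out.println(reduced == direct); // true: identical result
    }
}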
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Demo06MySort {
    // Read the sumScore totals, sort them, and write the result
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, KeySort, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, KeySort, NullWritable>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String id = splits[0];
            int sumScore = Integer.parseInt(splits[1]);
            KeySort keySort = new KeySort(id, sumScore);
            // No aggregation is needed, so no Reducer logic is required;
            // emit the composite key with a NullWritable value
            context.write(keySort, NullWritable.get());
        }
    }
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo06MySort");
        // Set the class the job runs
        job.setJarByClass(Demo06MySort.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(KeySort.class);
        // Map output value type
        job.setMapOutputValueClass(NullWritable.class);
        // With no reduce work this could be set to 0 (otherwise one reduce
        // task starts by default); but without a reduce task there is no
        // shuffle, and without a shuffle there is no sorting, so the
        // reducer is kept:
        // job.setNumReduceTasks(0);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/sumScore/output"));
        Path path = new Path("/data/mySort/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /*
        hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo06MySort
    */
}
// Custom key type with a custom sort order
class KeySort implements WritableComparable<KeySort> {
    String id;
    int sumScore;

    public KeySort() {
    }

    public KeySort(String id, int sumScore) {
        this.id = id;
        this.sumScore = sumScore;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        sumScore = in.readInt();
    }

    // Custom sort rule
    @Override
    public int compareTo(KeySort o) {
        // Sort by total score descending; break ties by id ascending
        // (Integer.compare avoids the overflow risk of subtracting ints)
        int cmp = Integer.compare(o.sumScore, this.sumScore);
        if (cmp != 0) {
            return cmp;
        }
        // Scores are equal: fall back to the id
        return this.id.compareTo(o.id);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeInt(sumScore);
    }

    @Override
    public String toString() {
        return id + "," + sumScore;
    }
}
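// A minimal local check of the compareTo contract above, runnable without a
// cluster: sorting KeySort instances yields descending total score with ties
// broken by ascending id. The ids and scores below are illustrative
// assumptions. The shuffle applies exactly this ordering to the map output
// keys, which is why the job keeps a reduce phase.
class KeySortSketch {
    public static void main(String[] args) {
        java.util.List<KeySort> keys = new java.util.ArrayList<>();
        keys.add(new KeySort("1500100001", 520));
        keys.add(new KeySort("1500100003", 610));
        keys.add(new KeySort("1500100002", 610));
        java.util.Collections.sort(keys); // uses KeySort.compareTo
        // Prints: 1500100002,610 then 1500100003,610 then 1500100001,520
        for (KeySort k : keys) {
            System.out.println(k);
        }
    }
}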
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Hashtable;
public class Demo04MapJoin {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Initialized in setup() before use
        Hashtable<String, String> stuKV = new Hashtable<>();

        // Runs once when each MapTask starts
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // Load the small table and cache it in the MapTask's memory;
            // the context provides the path of the broadcast small table
            URI[] cacheFiles = context.getCacheFiles();
            // Path of the small table
            String path = cacheFiles[0].toString();
            // Read the small table with the native HDFS Java API
            FileSystem fs = FileSystem.get(context.getConfiguration());
            FSDataInputStream fsDataInputStream = fs.open(new Path(path));
            BufferedReader br = new BufferedReader(new InputStreamReader(fsDataInputStream));
            String line;
            // Pick a data structure suited to the join: a Hashtable keyed by id
            while ((line = br.readLine()) != null) {
                String id = line.split(",")[0];
                // Store the whole line keyed by id for the join lookup
                stuKV.put(id, line);
            }
            br.close();
        }
        // Processes the large table's records
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String id = splits[0];
            String subjectId = splits[1];
            String subjectScore = splits[2];
            // Look the id up in the Hashtable to perform the join
            String stuInfo = stuKV.getOrDefault(id, "");
            // Guard against unmatched ids to avoid index-out-of-bounds errors
            if (!"".equals(stuInfo)) {
                String[] stuSplits = stuInfo.split(",");
                if (stuSplits.length >= 5) {
                    String name = stuSplits[1];
                    String clazz = stuSplits[4];
                    context.write(new Text(id), new Text(name + "," + clazz + "," + subjectId + "," + subjectScore));
                }
            }
        }
}
    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Use a comma as the output separator
        // (mapreduce.output.textoutputformat.separator is the current key;
        // mapred.textoutputformat.separator is the deprecated pre-2.x name)
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo04MapJoin");
        // Set the class the job runs
        job.setJarByClass(Demo04MapJoin.class);
        // Configure the map side
        // Specify the Mapper class
        job.setMapperClass(Demo04MapJoin.MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(Text.class);
        // No reduce work is needed; without this, one reduce task starts by default
        job.setNumReduceTasks(0);
        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/score/input"));
        // Treat each file as a table and broadcast the small one
        job.addCacheFile(new URI("hdfs://master:9000/data/stu/input/students.txt"));
        Path path = new Path("/data/mapJoin/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it;
        // otherwise the job fails because the output path must not exist
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /*
        hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo04MapJoin
    */
}
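// A local sketch of the map-join lookup above, runnable without a cluster:
// cache the small table in a map keyed by student id and probe it for each
// score record. The sample rows are assumptions following the layouts used
// above (students: id,name,age,gender,clazz; scores: id,subjectId,score).
// A HashMap would also do here, since each MapTask runs single-threaded;
// Hashtable's synchronization adds no value.
class MapJoinSketch {
    public static void main(String[] args) {
        java.util.Map<String, String> stuKV = new java.util.HashMap<>();
        stuKV.put("1500100001", "1500100001,Alice,22,F,Class1"); // assumed row
        String scoreLine = "1500100001,1000001,98";
        String[] splits = scoreLine.split(",");
        String stuInfo = stuKV.getOrDefault(splits[0], "");
        if (!"".equals(stuInfo)) { // skip scores with no matching student
            String[] stu = stuInfo.split(",");
            System.out.println(splits[0] + "," + stu[1] + "," + stu[4]
                    + "," + splits[1] + "," + splits[2]);
        }
    }
}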