案例一
student A 06140412 05 08 102 110 106
student B 06140407 02 06 60 98 80
student C 06140404 10 07 98 31 63
student D 06140403 07 10 105 109 107
student E 06140406 03 03 57 87 92
student F 06140408 10 06 102 102 50
案例二
student G 06140402 03 07 54 61 64
student H 06140401 05 03 83 76 111
student I 06140409 05 10 70 56 91
student J 06140411 07 09 22 119 112
student K 06140410 02 01 45 65 80
student L 06140405 03 02 79 20 26
需求:将多个文件合并为SequenceFile(存储多个小文件)
存储方式:文件路径+文件内容
FileInput类
package com.FileInputFormat.util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Custom InputFormat that treats each small file as a single, unsplittable
 * record of raw bytes, keyed by NullWritable.
 */
public class FileInput extends FileInputFormat<NullWritable, BytesWritable> {
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Hand back a reader that delivers the whole file as one record.
        return new FileRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Keep every small file intact: exactly one split per file.
        return false;
    }
}
FileRecordReader类
package com.FileInputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * RecordReader that emits exactly one record per input file: the key is
 * NullWritable and the value is the file's entire byte content.
 * Used to pack many small files into a single SequenceFile.
 */
public class FileRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit = null;
    private Configuration conf = null;
    // True once the single whole-file record has been emitted.
    private boolean flag = false;
    private FSDataInputStream fsDataInputStream = null;
    private FileSystem fileSystem = null;
    private BytesWritable value = new BytesWritable();

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Remember the split (one whole file, since isSplitable is false)
        // and the job configuration for later use in nextKeyValue().
        fileSplit = (FileSplit) inputSplit;
        conf = taskAttemptContext.getConfiguration();
    }

    /**
     * Reads the whole file into {@code value} on the first call and returns
     * true; every subsequent call returns false.
     *
     * @throws IOException if the file cannot be opened or fully read.
     *         The original version swallowed this exception and still
     *         returned true with an empty/partial value, silently producing
     *         corrupt output.
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        if (flag) {
            return false;
        }
        // Buffer sized to the split length = the whole file.
        byte[] buf = new byte[(int) fileSplit.getLength()];
        Path path = fileSplit.getPath();
        fileSystem = path.getFileSystem(conf);
        fsDataInputStream = fileSystem.open(path);
        try {
            IOUtils.readFully(fsDataInputStream, buf, 0, buf.length);
            value.set(buf, 0, buf.length);
        } finally {
            // Close only the stream. The FileSystem instance is cached and
            // shared by Hadoop; closing it here can break other readers in
            // the same JVM.
            IOUtils.closeStream(fsDataInputStream);
        }
        flag = true;
        return true;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        // Return the NullWritable singleton, not null: the framework may
        // serialize the key, and a raw null would NPE.
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Single-record reader: either nothing or everything has been read.
        return flag ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // Stream is already closed in nextKeyValue(); nothing to release.
    }
}
FileInputMapper类
package com.FileInputFormat.util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Mapper that tags each whole-file record with its source path:
 * (NullWritable, file bytes) -> (file path, file bytes).
 */
public class FileInputMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    private FileSplit fileSplit = null;
    private Path path = null;
    // Reused output key; the original allocated a new Text on every map()
    // call even though the path is fixed for the whole split.
    private Text outKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Each split is one whole file, so the path is constant per task.
        fileSplit = (FileSplit) context.getInputSplit();
        path = fileSplit.getPath();
        outKey.set(path.toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Emit (file path, file content) to the reducer.
        context.write(outKey, value);
    }
}
FileInpueReducer类
package com.FileInputFormat.util;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Identity-style reducer: forwards every (file path, file bytes) pair
 * unchanged into the SequenceFile output.
 */
public class FileInpueReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        // One output record per value grouped under this path.
        for (BytesWritable content : values) {
            context.write(key, content);
        }
    }
}
FileInputDriver类
package com.FileInputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Driver for the small-files-to-SequenceFile job.
 * Usage: FileInputDriver &lt;input path&gt; &lt;output path&gt;
 */
public class FileInputDriver {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            // Create the job.
            Job job = Job.getInstance(conf);
            // Locate the jar by the driver class.
            job.setJarByClass(FileInputDriver.class);
            // Wire up Mapper and Reducer.
            job.setMapperClass(FileInputMapper.class);
            job.setReducerClass(FileInpueReducer.class);
            // Mapper output types.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(BytesWritable.class);
            // Reducer (final) output types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(BytesWritable.class);
            // Custom whole-file input format; SequenceFile output.
            job.setInputFormatClass(FileInput.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            // Input and output paths from the command line.
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit and wait; exit code reflects success.
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            // Multi-catch replaces three identical catch blocks.
            e.printStackTrace();
        }
    }
}
案例
student A 06140412 05 08 102 110 106
student B 06140407 02 06 60 98 80
student C 06140404 10 07 98 31 63
student D 06140403 07 10 105 109 107
student E 06140406 03 03 57 87 92
student F 06140408 10 06 102 102 50
student G 06140402 03 07 54 61 64
student H 06140401 05 03 83 76 111
student I 06140409 05 10 70 56 91
student J 06140411 07 09 22 119 112
student K 06140410 02 01 45 65 80
student L 06140405 03 02 79 20 26
需求:过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
FileOutput类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Custom OutputFormat that routes records through FileRecordWriter,
 * which filters out everything except room-5 students.
 */
public class FileOutput extends FileOutputFormat<StudentGrade, NullWritable> {
    @Override
    public RecordWriter<StudentGrade, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // A fresh writer per task attempt.
        return new FileRecordWriter();
    }
}
FileRecordWriter类
package com.FileOutputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * RecordWriter that writes only students from exam room 5, one
 * tab-separated line per student, to a fixed local output file.
 */
public class FileRecordWriter extends RecordWriter<StudentGrade, NullWritable> {
    private FSDataOutputStream fsDataOutputStream;

    public FileRecordWriter() {
        // NOTE(review): builds a fresh Configuration instead of using the
        // task's, and the output path is hard-coded — presumably fine for
        // this local demo; confirm before reuse elsewhere.
        Configuration conf = new Configuration();
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            fsDataOutputStream = fileSystem.create(new Path("F:\\IdeaProjects\\out6\\out.log"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the record if the student sat in room 5; silently skips others.
     *
     * @throws IOException if the output stream failed to open or the write
     *         fails. (Previously a failed constructor left the stream null
     *         and this method threw a bare NullPointerException.)
     */
    @Override
    public void write(StudentGrade studentGrade, NullWritable nullWritable) throws IOException, InterruptedException {
        if (fsDataOutputStream == null) {
            // Fail fast with a diagnosable error instead of an NPE.
            throw new IOException("output stream was not opened; see earlier stack trace");
        }
        // Filter on the typed field directly instead of re-splitting the
        // record's own toString() output.
        if (studentGrade.getRoomID() == 5) {
            fsDataOutputStream.write(studentGrade.toString().getBytes());
            fsDataOutputStream.write("\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Release the stream if it was ever opened.
        if (fsDataOutputStream != null) {
            fsDataOutputStream.close();
        }
    }
}
StudentGrade类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Writable key holding one student's exam record. The total score (sum)
 * is derived from the three subject scores at construction time.
 * Ordering, equality and hashing are all based on the student id, so the
 * three stay mutually consistent (required for correct MapReduce
 * partitioning and grouping).
 */
public class StudentGrade implements WritableComparable<StudentGrade> {
    // Student name.
    private String name;
    // Student id, e.g. "06140412"; numeric, used as the sort key.
    private String id;
    // Exam-room number.
    private int roomID;
    // Chinese score.
    private int chinese;
    // Math score.
    private int math;
    // English score.
    private int english;
    // Total of the three subject scores.
    private int sum;

    /** No-arg constructor required by Hadoop serialization. */
    public StudentGrade() {}

    public StudentGrade(String name, String id, int roomID, int chinese, int math, int english) {
        this.name = name;
        this.id = id;
        this.roomID = roomID;
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public int getRoomID() {
        return roomID;
    }

    public void setRoomID(int roomID) {
        this.roomID = roomID;
    }

    public int getChinese() {
        return chinese;
    }

    public void setChinese(int chinese) {
        this.chinese = chinese;
    }

    public int getMath() {
        return math;
    }

    public void setMath(int math) {
        this.math = math;
    }

    public int getEnglish() {
        return english;
    }

    public void setEnglish(int english) {
        this.english = english;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /**
     * Numeric comparison of student ids. Integer.compare returns 0 for
     * equal ids, satisfying the compareTo contract — the original ternary
     * never returned 0, so a.compareTo(b) and b.compareTo(a) could both be
     * negative, breaking antisymmetry and MapReduce sort/group guarantees.
     */
    @Override
    public int compareTo(StudentGrade object) {
        return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(object.id));
    }

    /** Equality by id, consistent with compareTo. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof StudentGrade)) {
            return false;
        }
        StudentGrade other = (StudentGrade) o;
        return id == null ? other.id == null : id.equals(other.id);
    }

    /**
     * Hash by id. Without this override HashPartitioner used the identity
     * hash, scattering equal keys across reducers at random.
     */
    @Override
    public int hashCode() {
        return id == null ? 0 : id.hashCode();
    }

    /** Serialization order must match readFields exactly. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(id);
        dataOutput.writeInt(roomID);
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        name = dataInput.readUTF();
        id = dataInput.readUTF();
        roomID = dataInput.readInt();
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }

    /** Tab-separated: name, id, roomID, chinese, math, english, sum. */
    @Override
    public String toString() {
        return name + "\t" + id + "\t" + roomID + "\t" + chinese + "\t" + math + "\t" + english + "\t" + sum;
    }
}
FileOutputMapper类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Parses one tab-separated student line into a StudentGrade key.
 * Input layout: name, id, roomID, field[3] (skipped — presumably a seat
 * number), chinese, math, english.
 */
public class FileOutputMapper extends Mapper<LongWritable, Text, StudentGrade, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the raw line on tabs.
        String[] field = value.toString().split("\t");
        StudentGrade grade = new StudentGrade(
                field[0],                    // student name
                field[1],                    // student id
                Integer.parseInt(field[2]),  // exam-room number
                Integer.parseInt(field[4]),  // Chinese score (field[3] unused)
                Integer.parseInt(field[5]),  // math score
                Integer.parseInt(field[6])); // English score
        // Emit the record; NullWritable carries no payload.
        context.write(grade, NullWritable.get());
    }
}
FileOutputReducer类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Emits exactly one output record per distinct student key; the
 * NullWritable value carries no data.
 */
public class FileOutputReducer extends Reducer<StudentGrade, NullWritable, StudentGrade, NullWritable> {
    @Override
    protected void reduce(StudentGrade key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Take the first (and only meaningful) value for this key.
        NullWritable marker = values.iterator().next();
        context.write(key, marker);
    }
}
FileOutputDriver类
package com.FileOutputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Driver for the room-5 filter job.
 * Usage: FileOutputDriver &lt;input path&gt; &lt;output path&gt;
 */
public class FileOutputDriver {
    public static void main(String[] args) {
        try {
            // Create the configuration and the job.
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Locate the jar by the driver class.
            job.setJarByClass(FileOutputDriver.class);
            // Wire up Mapper and Reducer.
            job.setMapperClass(FileOutputMapper.class);
            job.setReducerClass(FileOutputReducer.class);
            // Mapper output types.
            job.setMapOutputKeyClass(StudentGrade.class);
            job.setMapOutputValueClass(NullWritable.class);
            // Reducer (final) output types.
            job.setOutputKeyClass(StudentGrade.class);
            job.setOutputValueClass(NullWritable.class);
            // Custom output format that filters to room-5 students.
            job.setOutputFormatClass(FileOutput.class);
            // Input and output paths from the command line.
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit and wait; exit code reflects success.
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            // Multi-catch replaces three identical catch blocks.
            e.printStackTrace();
        }
    }
}