The InputFormat describes the input specification of a MapReduce job. The MapReduce framework relies on the InputFormat to validate the input specification (for example, checking the input directories), to split the input files into InputSplits, and to read the records from each split one by one, converting them into the key/value pairs consumed by the map phase. Hadoop lets us implement a custom InputFormat, and thereby a customized MapReduce computation: a custom InputFormat class gives finer control over the input data and can support special-purpose or application-specific input file formats. An InputFormat implementation should extend the abstract class org.apache.hadoop.mapreduce.InputFormat and override the createRecordReader() and getSplits() methods.
Below we implement a custom InputFormat for images based on FileInputFormat, named ImageFileInputFormat, together with an ImageRecordReader. Because ImageFileInputFormat extends FileInputFormat, it inherits getSplits() and only needs to override createRecordReader(); overriding isSplitable() to return false guarantees that each image file stays in a single split.
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class ImageFileInputFormat extends FileInputFormat<Text, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // never split: each image file must be read as a whole
    }
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new ImageRecordReader();
    }
}
The ImageRecordReader class stores the image file name as a Text key and the raw image bytes as a BytesWritable value; in other words, it converts each image into one key/value pair for the map phase. Note that the whole file is read into memory at once, so this reader suits images that fit comfortably in a task's heap.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class ImageRecordReader extends RecordReader<Text, BytesWritable> {
    private Text key = null;
    private BytesWritable value = null;
    private FSDataInputStream fileStream = null;
    private FileSplit fileSplit;
    private boolean processed = false;
    private Configuration conf;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        fileSplit = (FileSplit) split;
        conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // The whole file is a single record, so this produces exactly one key/value pair.
        if (!processed) {
            Path filePath = fileSplit.getPath();
            FileSystem fs = filePath.getFileSystem(conf);
            fileStream = fs.open(filePath);
            key = new Text(filePath.getName());
            byte[] bytes = new byte[(int) fileSplit.getLength()];
            IOUtils.readFully(fileStream, bytes, 0, bytes.length);
            value = new BytesWritable(bytes);
            IOUtils.closeStream(fileStream);
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // The input stream is already closed in nextKeyValue().
    }
}
The ImageMapper class. In this example it decodes each image to obtain its dimensions and passes the (file name, bytes) pair through unchanged.
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ImageMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // getBytes() returns the padded backing array, so limit the stream to getLength().
        ByteArrayInputStream image =
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
        // The decoded BufferedImage exposes all the image information we need.
        BufferedImage bi = ImageIO.read(image);
        int height = bi.getHeight();
        int width = bi.getWidth();
        // No computation yet: pass the image through unchanged.
        context.write(key, value);
    }
}
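One caveat worth noting (a robustness assumption on our part, not handled in the original code): ImageIO.read() returns null when the bytes cannot be decoded, so a defensive map() might skip such records. A minimal sketch of such a guard, placed immediately after the call to ImageIO.read() and before bi is used:
// Hypothetical guard for map(): skip files that ImageIO cannot decode
// instead of failing the task, counting them for later inspection.
if (bi == null) {
    context.getCounter("ImageJob", "UNDECODABLE_IMAGES").increment(1);
    return;
}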
Next we implement writing the images back to HDFS. Hadoop's OutputFormat defines the storage format, storage location, and organization of a MapReduce computation's output: the OutputFormat prepares the output location and provides a RecordWriter implementation that performs the actual data serialization and storage. Below are the ImageFileOutputFormat and ImageRecordWriter classes.
import java.awt.image.BufferedImage;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ImageFileOutputFormat extends FileOutputFormat<Text, BufferedImage> {
    @Override
    public RecordWriter<Text, BufferedImage> getRecordWriter(
            TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        // The task's default work file serves as the parent path for the image files.
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(conf);
        return new ImageRecordWriter(file, fs);
    }
}
The ImageRecordWriter class. Each image is written as a separate BMP file, named after its key, under the parent path passed in by the output format.
import java.awt.image.BufferedImage;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class ImageRecordWriter extends RecordWriter<Text, BufferedImage> {
    private Path file;
    private FileSystem fs;

    public ImageRecordWriter(Path file, FileSystem fs) {
        this.file = file;
        this.fs = fs;
    }

    @Override
    public void write(Text key, BufferedImage value) throws IOException,
            InterruptedException {
        // One output file per key, named after the original image file.
        String name = key.toString();
        Path filePath = new Path(file, name);
        FSDataOutputStream fileStream = fs.create(filePath, false);
        try {
            ImageIO.write(value, "BMP", fileStream);
        } finally {
            // Close per image, so successive write() calls do not leak streams.
            fileStream.close();
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
        // Each stream is closed in write(), so nothing remains to release here.
    }
}
The ImageReducer class. It decodes the raw bytes back into a BufferedImage and hands it to the output format.
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class ImageReducer extends Reducer<Text, BytesWritable, Text, BufferedImage> {
    @Override
    public void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        for (BytesWritable val : values) {
            // Decode the raw bytes back into a BufferedImage and emit it.
            ByteArrayInputStream image =
                    new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
            BufferedImage bi = ImageIO.read(image);
            context.write(key, bi);
        }
    }
}
Finally, we write a Driver class that wires the custom input and output formats into a job.
import java.awt.image.BufferedImage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ImageDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ImageDriver");
        job.setJarByClass(ImageDriver.class);
        job.setInputFormatClass(ImageFileInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setMapperClass(ImageMapper.class);
        job.setReducerClass(ImageReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BufferedImage.class);
        job.setOutputFormatClass(ImageFileOutputFormat.class);
        // The two arguments must be directories, not files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
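Assuming the classes above are packaged into a jar (the jar name and HDFS paths below are placeholders), the job is submitted in the usual way, with an input directory of images and a not-yet-existing output directory:
hadoop jar image-job.jar ImageDriver /user/hadoop/images/in /user/hadoop/images/out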
The program above only reads and writes the images without performing any computation; the corresponding processing can be added in the Mapper and Reducer classes.
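As a minimal sketch of such a computation (an illustrative grayscale conversion, not part of the original program), the loop body of ImageReducer.reduce() could transform each decoded image in place before emitting it:
// Hypothetical addition to the loop body of ImageReducer.reduce():
// convert the decoded image to grayscale before writing it out.
for (int y = 0; y < bi.getHeight(); y++) {
    for (int x = 0; x < bi.getWidth(); x++) {
        java.awt.Color c = new java.awt.Color(bi.getRGB(x, y));
        int gray = (c.getRed() + c.getGreen() + c.getBlue()) / 3;
        bi.setRGB(x, y, new java.awt.Color(gray, gray, gray).getRGB());
    }
}
context.write(key, bi);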