The InputFormat describes the input specification of a MapReduce job. The MapReduce framework relies on the InputFormat to validate the input specification (for example, checking the input directories), to split the input files into InputSplits, and to read the records from each split one by one, converting them into the key/value pairs consumed by the map phase. Hadoop lets us implement a custom InputFormat, and thereby a customized MapReduce computation: a custom InputFormat class gives finer control over the input data and can support special-purpose or application-specific input file formats. An InputFormat implementation should extend the abstract class org.apache.hadoop.mapreduce.InputFormat and override the createRecordReader() and getSplits() methods.
Below we implement a custom InputFormat for images based on FileInputFormat, named ImageFileInputFormat, together with an ImageRecordReader. Because ImageFileInputFormat extends FileInputFormat, it inherits getSplits() and only needs to override createRecordReader(); overriding isSplitable() to return false guarantees that each image file stays in a single split.
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class ImageFileInputFormat extends FileInputFormat<Text, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // never split: each image file must be read as a whole
    }
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new ImageRecordReader();
    }
}
The ImageRecordReader class stores the image file name as a Text key and the raw image bytes as a BytesWritable value; in other words, it converts each image into one key/value pair for the map phase. Note that the whole file is read into memory at once, so this reader suits images that fit comfortably in a task's heap.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class ImageRecordReader extends RecordReader<Text, BytesWritable> {
    private Text key = null;
    private BytesWritable value = null;
    private FSDataInputStream fileStream = null;
    private FileSplit fileSplit;
    private boolean processed = false;
    private Configuration conf;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        fileSplit = (FileSplit) split;
        conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // The whole file is a single record, so this produces exactly one key/value pair.
        if (!processed) {
            Path filePath = fileSplit.getPath();
            FileSystem fs = filePath.getFileSystem(conf);
            fileStream = fs.open(filePath);
            key = new Text(filePath.getName());
            byte[] bytes = new byte[(int) fileSplit.getLength()];
            IOUtils.readFully(fileStream, bytes, 0, bytes.length);
            value = new BytesWritable(bytes);
            IOUtils.closeStream(fileStream);
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // The input stream is already closed in nextKeyValue().
    }
}
The ImageMapper class. In this example it decodes each image to obtain its dimensions and passes the (file name, bytes) pair through unchanged.
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ImageMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // getBytes() returns the padded backing array, so limit the stream to getLength().
        ByteArrayInputStream image =
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
        // The decoded BufferedImage exposes all the image information we need.
        BufferedImage bi = ImageIO.read(image);
        int height = bi.getHeight();
        int width = bi.getWidth();
        // No computation yet: pass the image through unchanged.
        context.write(key, value);
    }
}
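One caveat worth noting (a robustness assumption on our part, not handled in the original code): ImageIO.read() returns null when the bytes cannot be decoded, so a defensive map() might skip such records. A minimal sketch of such a guard, placed immediately after the call to ImageIO.read() and before bi is used:
// Hypothetical guard for map(): skip files that ImageIO cannot decode
// instead of failing the task, counting them for later inspection.
if (bi == null) {
    context.getCounter("ImageJob", "UNDECODABLE_IMAGES").increment(1);
    return;
}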
Next we implement writing the images back to HDFS. Hadoop's OutputFormat defines the storage format, storage location, and organization of a MapReduce computation's output: the OutputFormat prepares the output location and provides a RecordWriter implementation that performs the actual data serialization and storage. Below are the ImageFileOutputFormat and ImageRecordWriter classes.
import java.awt.image.BufferedImage;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ImageFileOutputFormat extends FileOutputFormat<Text, BufferedImage> {
    @Override
    public RecordWriter<Text, BufferedImage> getRecordWriter(
            TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        // The task's default work file serves as the parent path for the image files.
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(conf);
        return new ImageRecordWriter(file, fs);
    }
}
The ImageRecordWriter class. Each image is written as a separate BMP file, named after its key, under the parent path passed in by the output format.
import java.awt.image.BufferedImage;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class ImageRecordWriter extends RecordWriter<Text, BufferedImage> {
    private Path file;
    private FileSystem fs;

    public ImageRecordWriter(Path file, FileSystem fs) {
        this.file = file;
        this.fs = fs;
    }

    @Override
    public void write(Text key, BufferedImage value) throws IOException,
            InterruptedException {
        // One output file per key, named after the original image file.
        String name = key.toString();
        Path filePath = new Path(file, name);
        FSDataOutputStream fileStream = fs.create(filePath, false);
        try {
            ImageIO.write(value, "BMP", fileStream);
        } finally {
            // Close per image, so successive write() calls do not leak streams.
            fileStream.close();
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
        // Each stream is closed in write(), so nothing remains to release here.
    }
}
The ImageReducer class. It decodes the raw bytes back into a BufferedImage and hands it to the output format.
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class ImageReducer extends Reducer<Text, BytesWritable, Text, BufferedImage> {
    @Override
    public void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        for (BytesWritable val : values) {
            // Decode the raw bytes back into a BufferedImage and emit it.
            ByteArrayInputStream image =
                    new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
            BufferedImage bi = ImageIO.read(image);
            context.write(key, bi);
        }
    }
}
Finally, we write a Driver class that wires the custom input and output formats into a job.
import java.awt.image.BufferedImage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ImageDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ImageDriver");
        job.setJarByClass(ImageDriver.class);
        job.setInputFormatClass(ImageFileInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setMapperClass(ImageMapper.class);
        job.setReducerClass(ImageReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BufferedImage.class);
        job.setOutputFormatClass(ImageFileOutputFormat.class);
        // The two arguments must be directories, not files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
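Assuming the classes above are packaged into a jar (the jar name and HDFS paths below are placeholders), the job is submitted in the usual way, with an input directory of images and a not-yet-existing output directory:
hadoop jar image-job.jar ImageDriver /user/hadoop/images/in /user/hadoop/images/out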
The program above only reads and writes the images without performing any computation; the corresponding processing can be added in the Mapper and Reducer classes.
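As a minimal sketch of such a computation (an illustrative grayscale conversion, not part of the original program), the loop body of ImageReducer.reduce() could transform each decoded image in place before emitting it:
// Hypothetical addition to the loop body of ImageReducer.reduce():
// convert the decoded image to grayscale before writing it out.
for (int y = 0; y < bi.getHeight(); y++) {
    for (int x = 0; x < bi.getWidth(); x++) {
        java.awt.Color c = new java.awt.Color(bi.getRGB(x, y));
        int gray = (c.getRed() + c.getGreen() + c.getBlue()) / 3;
        bi.setRGB(x, y, new java.awt.Color(gray, gray, gray).getRGB());
    }
}
context.write(key, bi);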