模拟小需求
对多个小文件进行读取,设置不可切片,传给map方法的key为小文件的完整路径,value为整个小文件的内容。
输出的时候合并成一个SequenceFile文件(SequenceFile文件是Hadoop用来存储二进制形式的key-value对的文件格式),SequenceFile里面存储着多个文件,存储的形式为文件路径+名称为key,文件内容为value。
自定义步骤
RecordReader是在map之前调用,负责给map函数传key和value值
相关代码
CustomInputFormat.java
package MapReduceCustomInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/**
 * InputFormat that hands each small file to the mapper as a single record:
 * key = full file path, value = raw file bytes.
 */
public class CustomInputFormat extends FileInputFormat<Text, BytesWritable> {

    /**
     * The whole file must arrive as one record, so splitting is disabled.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Creates the reader for one (whole-file) split.
     *
     * <p>Note: the framework (MapTask) calls {@code initialize(split, context)}
     * on the returned reader before first use, so it must NOT be called
     * manually here — the original code initialized the reader twice.
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new CustomRecordReader();
    }
}
CustomRecordReader.java
package MapReduceCustomInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * RecordReader that emits exactly one key/value pair per split:
 * key = file path, value = the file's complete byte content.
 * Runs before map(); it is what feeds key/value into the map function.
 */
public class CustomRecordReader extends RecordReader<Text, BytesWritable> {

    // Assigned in initialize(); no need to pre-construct throwaway instances.
    private FileSplit fsplit;
    private Configuration conf;
    private final Text key = new Text();
    private final BytesWritable value = new BytesWritable();
    // true until the single whole-file record has been emitted
    private boolean flag = true;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        fsplit = (FileSplit) split;
        conf = context.getConfiguration();
    }

    /*
     * Core task: assign key (file path) and value (file bytes).
     * Returns true exactly once, because the file is not splitable and is
     * consumed in a single read.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (flag) {
            // Key: the file's full path.
            key.set(fsplit.getPath().toString());
            // Value: the entire file content as bytes.
            FileSystem fs = FileSystem.get(conf);
            byte[] bytes = new byte[(int) fsplit.getLength()];
            // Close the STREAM, not the FileSystem: FileSystem.get(conf)
            // returns a cached instance shared JVM-wide; closing it (as the
            // original fs.close() did) would break every other user of that
            // FileSystem. The original also leaked the unclosed stream.
            try (FSDataInputStream fis = fs.open(fsplit.getPath())) {
                IOUtils.readFully(fis, bytes, 0, bytes.length);
            }
            value.set(bytes, 0, bytes.length);
            flag = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** 0 until the single record is read, 1 afterwards (original always returned 0). */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return flag ? 0.0f : 1.0f;
    }

    @Override
    public void close() throws IOException {
        // Nothing to release: the input stream is closed in nextKeyValue().
    }
}
CustomDriver.java
package MapReduceCustomInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
/**
 * Driver: packs many small files into one SequenceFile where
 * key = file path (Text) and value = file content (BytesWritable).
 *
 * <p>Input/output paths may be supplied as {@code args[0]}/{@code args[1]};
 * the hard-coded defaults are kept for backward compatibility. The default
 * output directory is a SIBLING of the input directory — the original placed
 * it inside the input directory ({@code testdata\1\output}), which would make
 * a second run read its own previous output as input.
 */
public class CustomDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0
                ? args[0]
                : "F:\\Codes\\JavaCodes\\MapReduceLearning\\testdata\\1";
        String outputPath = args.length > 1
                ? args[1]
                : "F:\\Codes\\JavaCodes\\MapReduceLearning\\testdata\\output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(CustomDriver.class);
        job.setMapperClass(CustomMapper.class);
        job.setReducerClass(CustomReducer.class);

        // Whole-file input; binary key/value output as a SequenceFile.
        job.setInputFormatClass(CustomInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}