Custom InputFormat
This custom InputFormat is used to stitch many small files together into one large file. Each small file must be smaller than Integer.MAX_VALUE bytes, because in the example I learned from online the file is read into a buffer whose offset and length are int values, which imposes that limit.
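As a hedged illustration of that limit (this helper is not part of the original code; the method name and variables are made up, and it assumes the usual org.apache.hadoop.fs and org.apache.hadoop.io imports), a whole-file read guarded against oversized files might look like this:

    // Hypothetical helper: read an entire HDFS file into memory, guarding the int-sized buffer limit
    private static byte[] readWholeFile(FileSystem fs, Path path) throws IOException {
        long len = fs.getFileStatus(path).getLen();
        if (len > Integer.MAX_VALUE) {
            // byte[] indices and IOUtils.readFully lengths are ints, so larger files cannot be buffered this way
            throw new IOException("File too large to read as a whole: " + path);
        }
        byte[] buf = new byte[(int) len];
        try (FSDataInputStream in = fs.open(path)) {
            IOUtils.readFully(in, buf, 0, buf.length);
        }
        return buf;
    }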
Ways to handle lots of small files include:
packing them into a Hadoop archive (hadoop har);
CombineTextInputFormat (a minimal driver sketch follows this list);
and the custom InputFormat presented here.
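For comparison, switching a job to CombineTextInputFormat only takes a couple of driver-side lines; the 4 MB maximum split size below is an arbitrary example value, not something taken from this post:

    // Driver-side sketch: let Hadoop pack many small files into fewer splits
    job.setInputFormatClass(CombineTextInputFormat.class);
    // Upper bound on the size of each combined split, in bytes (example: 4 MB)
    CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);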
Code
DiyInputFormat
package com.xdc.diy;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * @author xdc
 * created by 2019/11/7
 */
public class DiyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Each small file is read as a whole, so it must not be split
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        DiyRecordReader diyRecordReader = new DiyRecordReader();
        diyRecordReader.initialize(inputSplit, taskAttemptContext);
        return diyRecordReader;
    }
}
DiyRecordReader
package com.xdc.diy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author xdc
 * created by 2019/11/7
 */
public class DiyRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private long length = 0;
    private Configuration configuration;
    private FileSplit inputSplit;
    private BytesWritable value = new BytesWritable();

    /**
     * Each map task produces exactly one key/value pair, so nextKeyValue() does real work only once.
     */
    private boolean isProcess = true;

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        this.inputSplit = (FileSplit) inputSplit;
        // Get the job configuration
        configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * Read the next key/value pair: the whole file becomes a single value.
     */
    @Override
    public boolean nextKeyValue() {
        if (isProcess) {
            // Path of the file backing this split
            Path path = inputSplit.getPath();
            // Length of the file
            length = inputSplit.getLength();
            FSDataInputStream inputStream = null;
            try {
                // Get the file system for this job
                FileSystem fs = FileSystem.get(configuration);
                // Open an input stream for the file
                inputStream = fs.open(path);
                // The file must not exceed Integer.MAX_VALUE bytes because the buffer index is an int
                byte[] bytes = new byte[(int) length];
                IOUtils.readFully(inputStream, bytes, 0, bytes.length);
                value.set(bytes, 0, bytes.length);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close only the stream; the FileSystem instance is cached and shared, so it is not closed here
                IOUtils.closeStream(inputStream);
            }
            isProcess = false;
            return true;
        }
        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        // The key carries no information; the mapper recovers the file path from the split
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        // Return the whole file content
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isProcess ? 0 : 1;
    }

    @Override
    public void close() throws IOException {
    }
}
DiyMapper
package com.xdc.diy;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author xdc
 * created by 2019/11/7
 */
public class DiyMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text text = new Text();

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Get the split this map task is processing
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        // Use the file path as the output key
        String path = inputSplit.getPath().toString();
        text.set(path);
        context.write(text, value);
    }
}
DiyDriver
package com.xdc.diy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * @author xdc
 * created by 2019/11/7
 */
public class DiyDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        args = new String[]{"D:\\hadoopwork\\input", "D:\\hadoopwork\\output"};
        // Create the configuration
        Configuration conf = new Configuration();
        // Create the job
        Job job = Job.getInstance(conf);
        // Set the driver and mapper classes
        job.setJarByClass(DiyDriver.class);
        job.setMapperClass(DiyMapper.class);
        // Set the output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // Use the custom InputFormat and write the result as a SequenceFile
        job.setInputFormatClass(DiyInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
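To verify the merged output, one can iterate over the SequenceFile the job produces. The sketch below is not part of the original post: the class name ReadSeqFile is made up, and it assumes the default single reducer, so the output file is part-r-00000 under the output directory used in DiyDriver.

package com.xdc.diy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import java.io.IOException;

/**
 * Hypothetical verification tool: prints each stored file path and its size
 * from the SequenceFile produced by DiyDriver.
 */
public class ReadSeqFile {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("D:\\hadoopwork\\output\\part-r-00000");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is one original small file: key = its path, value = its raw bytes
            while (reader.next(key, value)) {
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        }
    }
}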