CombineFileInputFormat 通常默认情况下是把合并后的文件一行一行读入到 map 中,这里将其改成了每次往 map 中读入整个小文件的内容
----------------------------------------------------------------------------------------------------------------------------
CombineSmallfileInputFormat.java
package combinAllFile;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
/**
 * InputFormat that packs many small files into combined splits and feeds each
 * small file to the mapper as a single record: key = file index within the
 * split (LongWritable), value = the whole file's bytes (BytesWritable).
 */
public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, BytesWritable> {

	/**
	 * Creates a CombineFileRecordReader that delegates each file in the combined
	 * split to one CombineSmallfileRecordReader instance.
	 *
	 * @param split   the (combined) input split, expected to be a CombineFileSplit
	 * @param context the task attempt context
	 * @return an initialized record reader over all files in the split
	 * @throws IOException if reader initialization fails
	 */
	@Override
	public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
		CombineFileSplit combineFileSplit = (CombineFileSplit) split;
		CombineFileRecordReader<LongWritable, BytesWritable> recordReader =
				new CombineFileRecordReader<LongWritable, BytesWritable>(combineFileSplit, context, CombineSmallfileRecordReader.class);
		try {
			recordReader.initialize(combineFileSplit, context);
		} catch (InterruptedException e) {
			// BUG FIX: the original built the RuntimeException but never threw it,
			// silently swallowing the failure. Restore the interrupt flag and
			// propagate with the original exception as the cause.
			Thread.currentThread().interrupt();
			throw new RuntimeException("Error to initialize CombineSmallfileRecordReader.", e);
		}
		return recordReader;
	}

	/**
	 * Each small file must be read whole by a single reader, so individual
	 * files are never split.
	 */
	@Override
	protected boolean isSplitable(JobContext context, Path file) {
		return false;
	}
}
------------------------------------------------------------------------------------------------------------------------------------------------------
在 RecordReader 中设置读取格式,并获取当前处理的文件名
CombineSmallfileRecordReader.java
package combinAllFile;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
/**
 * RecordReader that emits exactly one key/value pair per small file in a
 * CombineFileSplit: key = the file's index within the split, value = the
 * entire file content as bytes. The current file's name is also published
 * into the configuration under "map.input.file.name" for the mapper.
 */
public class CombineSmallfileRecordReader extends RecordReader<LongWritable, BytesWritable> {

	private CombineFileSplit combineFileSplit;
	private Path[] paths;            // full paths of all small files packed into this split
	private int totalLength;         // number of small files in the split
	private int currentIndex;        // index of the small file this reader handles
	// BUG FIX: currentKey was never assigned in the original, so
	// getCurrentKey() always returned null; it now carries the file index.
	private LongWritable currentKey = new LongWritable();
	private BytesWritable currentValue = new BytesWritable();
	private FileSplit fileSplit;     // the single small file backing this reader
	private boolean finishConverting = false; // true once the file has been emitted as one record
	private JobContext jobContext;

	/**
	 * Invoked reflectively by CombineFileRecordReader, once per file in the split.
	 *
	 * @param combineFileSplit the combined split containing many small files
	 * @param context          the task attempt context
	 * @param index            index of the small file block this reader processes
	 */
	public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
		super();
		this.combineFileSplit = combineFileSplit;
		this.currentIndex = index;
	}

	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
		this.combineFileSplit = (CombineFileSplit) split;
		// The combined split holds an array of files (path + offset + length each);
		// carve out a FileSplit describing just the file at currentIndex.
		fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex),
				combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
		this.jobContext = context;
		this.paths = combineFileSplit.getPaths();
		totalLength = paths.length;
		// Expose the current file name so the mapper can recover it.
		context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
	}

	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		return currentKey;
	}

	@Override
	public BytesWritable getCurrentValue() throws IOException, InterruptedException {
		return currentValue;
	}

	/**
	 * Reads the whole backing file into currentValue exactly once; subsequent
	 * calls return false.
	 */
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (currentIndex < 0 || currentIndex >= totalLength || finishConverting) {
			return false;
		}
		long length = fileSplit.getLength();
		// BUG FIX: the original cast the length to int unchecked, silently
		// truncating files larger than 2 GB; fail loudly instead.
		if (length > Integer.MAX_VALUE) {
			throw new IOException("File too large to read as a single record: " + fileSplit.getPath() + " (" + length + " bytes)");
		}
		int len = (int) length;
		byte[] content = new byte[len];
		Path file = fileSplit.getPath();
		FileSystem fs = file.getFileSystem(jobContext.getConfiguration());
		FSDataInputStream in = null;
		try {
			in = fs.open(file);
			IOUtils.readFully(in, content, 0, len);
			currentValue.set(content, 0, len);
		} finally {
			if (in != null) {
				IOUtils.closeStream(in);
			}
		}
		currentKey.set(currentIndex);
		finishConverting = true;
		return true;
	}

	@Override
	public float getProgress() throws IOException {
		// Single-record reader: progress is all-or-nothing.
		return finishConverting ? 1.0f : 0.0f;
	}

	@Override
	public void close() throws IOException {
		// The input stream is closed eagerly in nextKeyValue(); nothing to release here.
	}
}
-----------------------------------------------------------------------------------------------------------------------
CombineSmallfileMapper.java
package CombineLine;
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that re-keys each whole-file record by its file name (published by
 * CombineSmallfileRecordReader under "map.input.file.name") and passes the
 * file's bytes through unchanged.
 */
public class CombineSmallfileMapper extends Mapper<LongWritable, BytesWritable, Text, BytesWritable> {

	// Reused across map() calls to avoid per-record allocation.
	private Text file = new Text();

	/**
	 * @param key   index of the small file within its combined split (unused)
	 * @param value entire content of one small file
	 */
	@Override
	protected void map(LongWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
		// BUG FIX: the original also built an unused String copy of the whole
		// file (via getBytes() with no charset), wasting memory per record;
		// that dead code is removed.
		String fileName = context.getConfiguration().get("map.input.file.name");
		file.set(fileName);
		context.write(file, value);
	}
}
-----------------------------------------------------------------------------------------
CombineSmallfiles.java 程序入口
package CombineLine;
import java.io.IOException;
import java.util.Calendar;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import CombineLine.IdentityReducer;
/**
 * Job driver: combines many small HDFS files into large splits (via
 * CombineSmallfileInputFormat) and writes them out as a SequenceFile keyed by
 * original file name.
 *
 * Usage: conbinesmallfiles &lt;in&gt; &lt;out&gt; (falls back to built-in HDFS
 * paths when no arguments are supplied).
 */
public class CombineSmallfiles {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		JobConf conf = new JobConf();
		long start = System.currentTimeMillis();
		// BUG FIX: the original unconditionally overwrote the command-line
		// arguments with hard-coded paths, making GenericOptionsParser and the
		// usage check dead code. Fall back to defaults only when none are given.
		if (args == null || args.length < 2) {
			args = new String[2];
			args[0] = "hdfs://master:9000/newdata/YB2";
			args[1] = "hdfs://master:9000/NoDelete1/tes-" + Calendar.getInstance().getTimeInMillis();
		}
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: conbinesmallfiles <in> <out>");
			System.exit(2);
		}
		conf.set("fs.default.name", "hdfs://master:9000");
		conf.set("mapred.job.tracker", "master:9001");
		conf.setNumMapTasks(20);
		Job job = new Job(conf, "combine smallfiles");
		job.setJarByClass(CombineSmallfiles.class);
		job.setMapperClass(CombineSmallfileMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(BytesWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(BytesWritable.class);
		job.setInputFormatClass(CombineSmallfileInputFormat.class);
		// Drop this line (and switch the value types above) to emit Text output
		// instead of a SequenceFile.
		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		// BUG FIX: the original spun in `while (!job.waitForCompletion(true))`,
		// re-waiting forever on an already-finished failed job. Run once and
		// report the outcome via the process exit code.
		boolean success = job.waitForCompletion(true);
		long end = System.currentTimeMillis();
		System.out.println(end - start);
		System.exit(success ? 0 : 1);
	}
}