This post deals with combining compressed small files. If the input files are not compressed, the code still works; in that case each chunk of a combined split is simply one small file.
There is not much more to say, so here are the three complete Java files. Only the map and reduce code and the main (driver) code are missing, and those were already provided in the previous posts; a minimal driver sketch is also included at the end of this post.
CompressedCombineFileInputFormat.java
package compressedCombineFile;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

/**
 * Packs many small (possibly compressed) files into CombineFileSplits
 * and hands each chunk of a split to its own record reader.
 */
public class CompressedCombineFileInputFormat
        extends CombineFileInputFormat<CompressedCombineFileWritable, Text> {

    public CompressedCombineFileInputFormat() {
        super();
    }

    @Override
    public RecordReader<CompressedCombineFileWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader delegates every chunk of the combined split
        // to a fresh CompressedCombineFileRecordReader instance.
        return new CombineFileRecordReader<CompressedCombineFileWritable, Text>(
                (CombineFileSplit) split, context,
                CompressedCombineFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Compressed files cannot be split mid-stream, so never split a file.
        return false;
    }
}
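One practical note: with no upper bound configured, CombineFileInputFormat can collapse all the small files on a node into one oversized split. A minimal sketch of capping the combined split size inside the driver's main() method, assuming Hadoop 2.x (the property name and the 128 MB figure are my assumptions, not from the original post):

// Sketch: cap combined splits at roughly 128 MB (assumed value) before
// submitting the job. CombineFileInputFormat reads this property in
// getSplits(); older releases use "mapred.max.split.size" instead.
Configuration conf = new Configuration();
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);
Job job = Job.getInstance(conf, "combine-small-files");
job.setInputFormatClass(CompressedCombineFileInputFormat.class);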
CompressedCombineFileRecordReader.java: sets up how records are read from each chunk
package compressedCombineFile;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;
/**
 * A RecordReader is responsible for extracting records from one chunk
 * (one underlying file) of the CombineFileSplit.
 */
public class CompressedCombineFileRecordReader
        extends RecordReader<CompressedCombineFileWritable, Text> {

    private long startOffset;   // where this chunk begins in the (decompressed) file
    private long end;           // position just past the last byte of the chunk
    private long pos;           // current read position
    private FileSystem fs;
    private Path path;          // original, possibly compressed, file
    private Path dPath;         // decompressed file that is actually read
    private CompressedCombineFileWritable key = new CompressedCombineFileWritable();
    private Text value;
    private long rlength;       // length of the decompressed file
    private FSDataInputStream fileIn;
    private LineReader reader;
    /**
     * CombineFileRecordReader creates one instance per chunk, passing the
     * index of the file inside the CombineFileSplit.
     */
    public CompressedCombineFileRecordReader(CombineFileSplit split,
            TaskAttemptContext context, Integer index) throws IOException {
        Configuration currentConf = context.getConfiguration();
        this.path = split.getPath(index);
        boolean isCompressed = findCodec(currentConf, path);
        if (isCompressed) {
            // Decompresses the whole file; sets dPath and rlength as side effects.
            codecWiseDecompress(currentConf);
        }
        fs = this.path.getFileSystem(currentConf);
        this.startOffset = split.getOffset(index);
        if (isCompressed) {
            this.end = startOffset + rlength;
        } else {
            this.end = startOffset + split.getLength(index);
            dPath = path;
        }
        boolean skipFirstLine = false;
        fileIn = fs.open(dPath);
        if (isCompressed) {
            // Clean up the temporary decompressed copy when the task JVM exits.
            fs.deleteOnExit(dPath);
        }
        if (startOffset != 0) {
            // Mid-file chunk: the previous chunk's reader already consumed the
            // line spanning the boundary, so skip the partial first line.
            skipFirstLine = true;
            --startOffset;
            fileIn.seek(startOffset);
        }
        reader = new LineReader(fileIn);
        if (skipFirstLine) {
            startOffset += reader.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
        }
        this.pos = startOffset;
    }
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // All setup happens in the constructor, which CombineFileRecordReader
        // calls once per chunk, so there is nothing to do here.
    }

    @Override
    public void close() throws IOException {
        // Closing the LineReader also closes the underlying input stream.
        if (reader != null) {
            reader.close();
        }
    }

    @Override
    public float getProgress() throws IOException {
        if (startOffset == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
    }
    @Override
    public boolean nextKeyValue() throws IOException {
        if (key == null || key.fileName == null) {
            key = new CompressedCombineFileWritable();
            key.fileName = dPath.getName();
        }
        key.offset = pos;
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        if (pos < end) {
            newSize = reader.readLine(value);
            pos += newSize;
        }
        if (newSize == 0) {
            // End of this chunk: signal that no more records remain.
            key = null;
            value = null;
            return false;
        }
        return true;
    }

    @Override
    public CompressedCombineFileWritable getCurrentKey()
            throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }
    /**
     * Decompresses this.path next to the original file using the codec
     * inferred from its extension; sets dPath and rlength as side effects.
     */
    private void codecWiseDecompress(Configuration conf) throws IOException {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);
        if (codec == null) {
            // No registered codec for this suffix: fail the task cleanly.
            throw new IOException("No codec found for " + path);
        }
        // Strip the codec's extension, e.g. part-00000.gz -> part-00000.
        String outputUri = CompressionCodecFactory.removeSuffix(
                path.toString(), codec.getDefaultExtension());
        dPath = new Path(outputUri);
        InputStream in = null;
        OutputStream out = null;
        fs = this.path.getFileSystem(conf);
        try {
            in = codec.createInputStream(fs.open(path));
            out = fs.create(dPath);
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
            rlength = fs.getFileStatus(dPath).getLen();
        }
    }

    private boolean findCodec(Configuration conf, Path p) {
        // A codec registered for the path's suffix means the file is compressed.
        return new CompressionCodecFactory(conf).getCodec(p) != null;
    }
}
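With this reader in place, a mapper receives the decompressed file name plus byte offset as its key and one line of text as its value. A minimal sketch of such a mapper, counting lines per source file (the LineCountMapper class is hypothetical, not part of the original series):

package compressedCombineFile;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical example: emits (file name, 1) for every input line.
public class LineCountMapper
        extends Mapper<CompressedCombineFileWritable, Text, Text, LongWritable> {

    private static final LongWritable ONE = new LongWritable(1);
    private final Text fileName = new Text();

    @Override
    protected void map(CompressedCombineFileWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key.fileName names the small file; key.offset is this line's byte offset.
        fileName.set(key.fileName);
        context.write(fileName, ONE);
    }
}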
CompressedCombineFileWritable.java
package compressedCombineFile;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * This record keeps (fileName, offset) pairs.
 */
@SuppressWarnings("rawtypes")
public class CompressedCombineFileWritable implements WritableComparable {

    public long offset;
    public String fileName;

    public CompressedCombineFileWritable() {
        super();
    }

    public CompressedCombineFileWritable(long offset, String fileName) {
        super();
        this.offset = offset;
        this.fileName = fileName;
    }

    public void readFields(DataInput in) throws IOException {
        this.offset = in.readLong();
        this.fileName = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(offset);
        Text.writeString(out, fileName);
    }

    public int compareTo(Object o) {
        CompressedCombineFileWritable that = (CompressedCombineFileWritable) o;
        // Order by file name first, then by offset within the file.
        int f = this.fileName.compareTo(that.fileName);
        if (f == 0) {
            return (int) Math.signum((double) (this.offset - that.offset));
        }
        return f;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof CompressedCombineFileWritable) {
            return this.compareTo(obj) == 0;
        }
        return false;
    }

    @Override
    public int hashCode() {
        final int hashPrime = 47;
        int hash = 13;
        hash = hashPrime * hash + (this.fileName != null ? this.fileName.hashCode() : 0);
        hash = hashPrime * hash + (int) (this.offset ^ (this.offset >>> 16));
        return hash;
    }

    @Override
    public String toString() {
        return this.fileName + "-" + this.offset;
    }
}
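Finally, the driver sketch promised at the top. It assumes Hadoop 2.x and wires in the hypothetical LineCountMapper from the earlier sketch together with the stock LongSumReducer; the class name, job name, split-size value, and argument handling are all placeholders, not the original series' driver:

package compressedCombineFile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;

// Hypothetical driver: directory of small files in, line counts per file out.
public class CompressedCombineFileDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap combined split size (see the note after the InputFormat above).
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);

        Job job = Job.getInstance(conf, "compressed-combine-small-files");
        job.setJarByClass(CompressedCombineFileDriver.class);
        job.setInputFormatClass(CompressedCombineFileInputFormat.class);

        job.setMapperClass(LineCountMapper.class);
        job.setReducerClass(LongSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir of small files
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}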