As we know, the number of map tasks is generally determined by the number of input splits. When the input consists of a large number of small files, this can produce a correspondingly large number of map tasks, each doing very little work.
So we want a single map task to handle several small files at once. My first instinct was to write my own InputFormat, RecordReader, and FileSplit, but it turns out Hadoop already provides CombineFileInputFormat and CombineFileRecordReader for exactly this purpose. (The split logic of the stock CombineFileInputFormat can still cause data skew; more on that later.)
The idea behind CombineFileInputFormat is roughly this: it packs the metadata of multiple input files (the small files) into a single CombineFileSplit. Small files in HDFS are single-block files, one block per file, so a CombineFileSplit effectively describes a group of file blocks, recording each file's start offset, length, and block locations.
(Most of the code below is adapted from other people's blog posts; after digesting it I am recording it here as a memo for myself. My thanks to the original authors.)
1. First, let's look at the source of CombineFileSplit (just five simple fields):
    public class CombineFileSplit extends InputSplit implements Writable {

      private Path[] paths;
      private long[] startoffset;
      private long[] lengths;
      private String[] locations;
      private long totLength;

      /**
       * default constructor
       */
      public CombineFileSplit() {}
      public CombineFileSplit(Path[] files, long[] start,
                              long[] lengths, String[] locations) {
        initSplit(files, start, lengths, locations);
      }

      public CombineFileSplit(Path[] files, long[] lengths) {
        long[] startoffset = new long[files.length];
        for (int i = 0; i < startoffset.length; i++) {
          startoffset[i] = 0;
        }
        String[] locations = new String[files.length];
        for (int i = 0; i < locations.length; i++) {
          locations[i] = "";
        }
        initSplit(files, startoffset, lengths, locations);
      }

      private void initSplit(Path[] files, long[] start,
                             long[] lengths, String[] locations) {
        this.startoffset = start;
        this.lengths = lengths;
        this.paths = files;
        this.totLength = 0;
        this.locations = locations;
        for (long length : lengths) {
          totLength += length;
        }
      }

      /**
       * Copy constructor
       */
      public CombineFileSplit(CombineFileSplit old) throws IOException {
        this(old.getPaths(), old.getStartOffsets(),
             old.getLengths(), old.getLocations());
      }

      public long getLength() {
        return totLength;
      }

      /** Returns an array containing the start offsets of the files in the split*/
      public long[] getStartOffsets() {
        return startoffset;
      }

      /** Returns an array containing the lengths of the files in the split*/
      public long[] getLengths() {
        return lengths;
      }

      /** Returns the start offset of the i<sup>th</sup> Path */
      public long getOffset(int i) {
        return startoffset[i];
      }

      /** Returns the length of the i<sup>th</sup> Path */
      public long getLength(int i) {
        return lengths[i];
      }

      /** Returns the number of Paths in the split */
      public int getNumPaths() {
        return paths.length;
      }

      /** Returns the i<sup>th</sup> Path */
      public Path getPath(int i) {
        return paths[i];
      }

      /** Returns all the Paths in the split */
      public Path[] getPaths() {
        return paths;
      }

      /** Returns all the Paths where this input-split resides */
      public String[] getLocations() throws IOException {
        return locations;
      }

      public void readFields(DataInput in) throws IOException {
        totLength = in.readLong();
        int arrLength = in.readInt();
        lengths = new long[arrLength];
        for (int i = 0; i < arrLength; i++) {
          lengths[i] = in.readLong();
        }
        int filesLength = in.readInt();
        paths = new Path[filesLength];
        for (int i = 0; i < filesLength; i++) {
          paths[i] = new Path(Text.readString(in));
        }
        arrLength = in.readInt();
        startoffset = new long[arrLength];
        for (int i = 0; i < arrLength; i++) {
          startoffset[i] = in.readLong();
        }
      }

      public void write(DataOutput out) throws IOException {
        out.writeLong(totLength);
        out.writeInt(lengths.length);
        for (long length : lengths) {
          out.writeLong(length);
        }
        out.writeInt(paths.length);
        for (Path p : paths) {
          Text.writeString(out, p.toString());
        }
        out.writeInt(startoffset.length);
        for (long length : startoffset) {
          out.writeLong(length);
        }
      }

      @Override
      public String toString() {
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < paths.length; i++) {
          if (i == 0) {
            sb.append("Paths:");
          }
          sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                    "+" + lengths[i]);
          if (i < paths.length - 1) {
            sb.append(",");
          }
        }
        if (locations != null) {
          String locs = "";
          StringBuffer locsb = new StringBuffer();
          for (int i = 0; i < locations.length; i++) {
            locsb.append(locations[i] + ":");
          }
          locs = locsb.toString();
          sb.append(" Locations:" + locs + "; ");
        }
        return sb.toString();
      }
    }
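To make the grouping concrete, here is a minimal sketch that packs two single-block files into one split; the paths, lengths, and host names are illustrative assumptions, not part of the original post:

    // Illustrative only: two small files packed into one CombineFileSplit.
    Path[] paths = { new Path("/data/small/a.txt"), new Path("/data/small/b.txt") };
    long[] starts = { 0L, 0L };                    // small files start at offset 0
    long[] lengths = { 1024L, 2048L };             // per-file lengths in bytes
    String[] hosts = { "datanode1", "datanode2" }; // preferred hosts (illustrative)
    CombineFileSplit split = new CombineFileSplit(paths, starts, lengths, hosts);
    System.out.println(split.getNumPaths() + " files, " + split.getLength() + " bytes total");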
2. Next we implement a RecordReader, which is really just a wrapper around LineRecordReader:
    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

    public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {

      private CombineFileSplit combineFileSplit;
      private LineRecordReader lineRecordReader = new LineRecordReader();
      private Path[] paths;
      private int totalLength;
      private int currentIndex;
      private float currentProgress = 0;
      private LongWritable currentKey;
      private Text currentValue = new Text();

      public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
        super();
        this.combineFileSplit = combineFileSplit;
        this.currentIndex = index; // index of the small-file block this reader handles within the CombineFileSplit
      }

      @Override
      public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.combineFileSplit = (CombineFileSplit) split;
        // To read one small-file block of the CombineFileSplit with a LineRecordReader,
        // we first have to wrap it in a FileSplit.
        FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
        lineRecordReader.initialize(fileSplit, context);

        this.paths = combineFileSplit.getPaths();
        totalLength = paths.length;
        context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
      }

      @Override
      public LongWritable getCurrentKey() throws IOException, InterruptedException {
        currentKey = lineRecordReader.getCurrentKey();
        return currentKey;
      }

      @Override
      public Text getCurrentValue() throws IOException, InterruptedException {
        currentValue = lineRecordReader.getCurrentValue();
        return currentValue;
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        if (currentIndex >= 0 && currentIndex < totalLength) {
          return lineRecordReader.nextKeyValue();
        } else {
          return false;
        }
      }

      @Override
      public float getProgress() throws IOException {
        if (currentIndex >= 0 && currentIndex < totalLength) {
          currentProgress = (float) currentIndex / totalLength;
          return currentProgress;
        }
        return currentProgress;
      }

      @Override
      public void close() throws IOException {
        lineRecordReader.close();
      }
    }
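One detail worth highlighting: Hadoop's CombineFileRecordReader instantiates this class once per file in the split via reflection, and it requires exactly the three-argument constructor shown above, (CombineFileSplit, TaskAttemptContext, Integer). The Integer tells each reader instance which file in the split it is responsible for.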
3. Then we implement a subclass of CombineFileInputFormat:
    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

      @Override
      public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

        CombineFileSplit combineFileSplit = (CombineFileSplit) split;
        // CombineFileRecordReader creates one CombineSmallfileRecordReader per file in the split.
        CombineFileRecordReader<LongWritable, Text> recordReader = new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
        try {
          recordReader.initialize(combineFileSplit, context);
        } catch (InterruptedException e) {
          throw new RuntimeException("Error to initialize CombineSmallfileRecordReader.", e);
        }
        return recordReader;
      }

    }
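With these two classes in place, wiring them into a job is straightforward. Below is a minimal driver sketch; the mapper class WordCountMapper, the job name, and the argument paths are illustrative assumptions, not part of the original post:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class CombineSmallfileDriver {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap each combined split at 32 MB so a single map task does not swallow everything.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 32 * 1024 * 1024);
        Job job = Job.getInstance(conf, "combine-small-files");
        job.setJarByClass(CombineSmallfileDriver.class);
        job.setInputFormatClass(CombineSmallfileInputFormat.class);
        job.setMapperClass(WordCountMapper.class); // hypothetical mapper, not shown in this post
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }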
4. At this point the small-file input problem is solved, but let's look at how CombineFileInputFormat actually forms splits.
CombineFileInputFormat controls split formation with three parameters: maxSplitSize, minSizeNode, and minSizeRack (a configuration sketch follows the list below):
1. If maxSplitSize ("mapreduce.input.fileinputformat.split.maxsize") is set, blocks on the same node are combined, and a new split is emitted each time the accumulated size exceeds maxSplitSize. If it is not set, blocks on the node are only gathered; no split is formed yet.
2. If minSizeNode ("mapreduce.input.fileinputformat.split.minsize.per.node") is set, the blocks left over from step 1 on that node are combined; if their total exceeds minSizeNode, they all become one split. Otherwise these blocks are handed up to be combined with blocks on the same rack.
3. Every node is processed the same way, and then all remaining blocks of the whole rack are processed with the logic of step 1. For what is still left over, if minSizeRack ("mapreduce.input.fileinputformat.split.minsize.per.rack") is set and the total exceeds minSizeRack, it all becomes one split; otherwise these blocks are kept, waiting to be pooled with the leftovers from all racks.
Each rack is processed with steps 1 to 3; the leftovers from all racks are then pooled and processed once more with the logic of step 1. Whatever still remains becomes one final split.
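Here is a hedged configuration sketch for the three knobs, assuming an existing Job named job; the property names are the Hadoop 2.x ones quoted above and the sizes are illustrative. Note that CombineFileInputFormat rejects configurations where minSizeNode exceeds minSizeRack or where either exceeds maxSplitSize:

    Configuration conf = job.getConfiguration();
    // Illustrative sizes: cap splits at 128 MB, sweep up leftovers above 64 MB.
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128 * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64 * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 64 * 1024 * 1024);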
From this logic we can conclude:
If only maxSplitSize is set (e.g. job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "33554432")), then essentially every split is packed up to maxSplitSize.
If none of maxSplitSize, minSizeNode, and minSizeRack is set, the entire input is merged into a single split! That lopsided packing is the data-skew problem mentioned at the beginning; one way around it is to override getSplits ourselves and spread the files evenly across a bounded number of splits, as the following code does:
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    // The class wrapper below is assumed: the LOG line names the class MultiFileInputFormat,
    // and extending CombineSmallfileInputFormat (above) lets it reuse createRecordReader.
    public class MultiFileInputFormat extends CombineSmallfileInputFormat {

      private static final Log LOG = LogFactory.getLog(MultiFileInputFormat.class);
      public static final String CONFNAME_INPUT_SPLIT_MAX_NUM = "multifileinputformat.max_split_num";
      public static final Integer DEFAULT_MAX_SPLIT_NUM = 50;

      public static void setMaxInputSplitNum(Job job, Integer maxSplitNum) {
        job.getConfiguration().setInt(CONFNAME_INPUT_SPLIT_MAX_NUM, maxSplitNum);
      }

      @Override
      public List<InputSplit> getSplits(JobContext job) throws IOException {
        // get all the files in input path
        List<FileStatus> stats = listStatus(job);
        List<InputSplit> splits = new ArrayList<InputSplit>();
        if (stats.size() == 0) {
          return splits;
        }
        // compute the average split length
        long totalLen = 0;
        for (FileStatus stat : stats) {
          totalLen += stat.getLen();
        }
        int maxSplitNum = job.getConfiguration().getInt(CONFNAME_INPUT_SPLIT_MAX_NUM, DEFAULT_MAX_SPLIT_NUM);
        int expectSplitNum = maxSplitNum < stats.size() ? maxSplitNum : stats.size();
        long averageLen = totalLen / expectSplitNum;
        LOG.info("Prepare InputSplit : averageLen(" + averageLen + ") totalLen(" + totalLen
            + ") expectSplitNum(" + expectSplitNum + ") ");
        // build the input splits
        List<Path> pathLst = new ArrayList<Path>();
        List<Long> offsetLst = new ArrayList<Long>();
        List<Long> lengthLst = new ArrayList<Long>();
        long currentLen = 0;
        for (int i = 0; i < stats.size(); i++) {
          FileStatus stat = stats.get(i);
          pathLst.add(stat.getPath());
          offsetLst.add(0L);
          lengthLst.add(stat.getLen());
          currentLen += stat.getLen();
          if (splits.size() < expectSplitNum - 1 && currentLen > averageLen) {
            Path[] pathArray = new Path[pathLst.size()];
            CombineFileSplit thissplit = new CombineFileSplit(pathLst.toArray(pathArray),
                getLongArray(offsetLst), getLongArray(lengthLst), new String[0]);
            LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
                + ") length(" + currentLen + ")");
            splits.add(thissplit);

            pathLst.clear();
            offsetLst.clear();
            lengthLst.clear();
            currentLen = 0;
          }
        }
        // the remaining files form the last split
        if (pathLst.size() > 0) {
          Path[] pathArray = new Path[pathLst.size()];
          CombineFileSplit thissplit =
              new CombineFileSplit(pathLst.toArray(pathArray), getLongArray(offsetLst),
                  getLongArray(lengthLst), new String[0]);
          LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
              + ") length(" + currentLen + ")");
          splits.add(thissplit);
        }
        return splits;
      }

      private long[] getLongArray(List<Long> lst) {
        long[] rst = new long[lst.size()];
        for (int i = 0; i < lst.size(); i++) {
          rst[i] = lst.get(i);
        }
        return rst;
      }
    }
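Because every split is packed toward the byte average, no single map task ends up with a disproportionate share of the input. Usage is then a hedged one-liner per job; the cap of 20 here is arbitrary:

    Job job = Job.getInstance(new Configuration(), "even-combine"); // illustrative job name
    job.setInputFormatClass(MultiFileInputFormat.class);
    MultiFileInputFormat.setMaxInputSplitNum(job, 20); // at most 20 map tasks, each with a roughly equal share of the bytes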
Source: http://blog.itpub.net/29754888/viewspace-1225105/