inputFormat,RecordRead的理解和定制

最新推荐文章于 2022-03-14 18:58:20 发布

杨过悔

最新推荐文章于 2022-03-14 18:58:20 发布

阅读量964

点赞数

分类专栏： Hadoop笔记文章标签： hadoop split record

本文链接：https://blog.csdn.net/u013571243/article/details/51055836

版权

Hadoop笔记专栏收录该内容

41 篇文章 0 订阅

订阅专栏

先看看InputFormat接口和

public interface InputFormat<K, V> {
    InputSplit[] getSplits(JobConf var1, int var2) throws IOException;

    RecordReader<K, V> getRecordReader(InputSplit var1, JobConf var2, Reporter var3)

    throws IOException;
     
}
 
 
public interface RecordReader<K, V> {
    boolean next(K var1, V var2) throws IOException;

    K createKey();

    V createValue();

    long getPos() throws IOException;

    void close() throws IOException;

    float getProgress() throws IOException;
}
 
其中Input定义了如何分割数据,recordread定义了如何给mapper提供k,v
 
我们如何实现定制呢?
 
 
 
Inputformat有个实现版本,FileInputFormat,里面的代码很值得去查看
<span style="font-size:18px;">//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by Fernflower decompiler)
//

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
    public static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    private static final double SPLIT_SLOP = 1.1D;
    private long minSplitSize = 1L;
    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };
    static final String NUM_INPUT_FILES = "mapreduce.input.num.files";

    public FileInputFormat() {
    }

    protected void setMinSplitSize(long minSplitSize) {
        this.minSplitSize = minSplitSize;
    }

    protected boolean isSplitable(FileSystem fs, Path filename) {
        return true;
    }

    public abstract RecordReader<K, V> getRecordReader(InputSplit var1, JobConf var2, Reporter var3) throws IOException;

    public static void setInputPathFilter(JobConf conf, Class<? extends PathFilter> filter) {
        conf.setClass("mapred.input.pathFilter.class", filter, PathFilter.class);
    }

    public static PathFilter getInputPathFilter(JobConf conf) {
        Class filterClass = conf.getClass("mapred.input.pathFilter.class", (Class)null, PathFilter.class);
        return filterClass != null?(PathFilter)ReflectionUtils.newInstance(filterClass, conf):null;
    }

    protected FileStatus[] listStatus(JobConf job) throws IOException {
        Path[] dirs = getInputPaths(job);
        if(dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        } else {
            TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
            ArrayList result = new ArrayList();
            ArrayList errors = new ArrayList();
            ArrayList filters = new ArrayList();
            filters.add(hiddenFileFilter);
            PathFilter jobFilter = getInputPathFilter(job);
            if(jobFilter != null) {
                filters.add(jobFilter);
            }

            FileInputFormat.MultiPathFilter inputFilter = new FileInputFormat.MultiPathFilter(filters);
            Path[] arr$ = dirs;
            int len$ = dirs.length;

            for(int i$ = 0; i$ < len$; ++i$) {
                Path p = arr$[i$];
                FileSystem fs = p.getFileSystem(job);
                FileStatus[] matches = fs.globStatus(p, inputFilter);
                if(matches == null) {
                    errors.add(new IOException("Input path does not exist: " + p));
                } else if(matches.length == 0) {
                    errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
                } else {
                    FileStatus[] arr$1 = matches;
                    int len$1 = matches.length;

                    for(int i$1 = 0; i$1 < len$1; ++i$1) {
                        FileStatus globStat = arr$1[i$1];
                        if(globStat.isDir()) {
                            FileStatus[] arr$2 = fs.listStatus(globStat.getPath(), inputFilter);
                            int len$2 = arr$2.length;

                            for(int i$2 = 0; i$2 < len$2; ++i$2) {
                                FileStatus stat = arr$2[i$2];
                                result.add(stat);
                            }
                        } else {
                            result.add(globStat);
                        }
                    }
                }
            }

            if(!errors.isEmpty()) {
                throw new InvalidInputException(errors);
            } else {
                LOG.info("Total input paths to process : " + result.size());
                return (FileStatus[])result.toArray(new FileStatus[result.size()]);
            }
        }
    }

    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        FileStatus[] files = this.listStatus(job);
        job.setLong("mapreduce.input.num.files", (long)files.length);
        long totalSize = 0L;
        FileStatus[] goalSize = files;
        int len$ = files.length;

        for(int minSize = 0; minSize < len$; ++minSize) {
            FileStatus file = goalSize[minSize];
            if(file.isDir()) {
                throw new IOException("Not a file: " + file.getPath());
            }

            totalSize += file.getLen();
        }

        long var29 = totalSize / (long)(numSplits == 0?1:numSplits);
        long var30 = Math.max(job.getLong("mapred.min.split.size", 1L), this.minSplitSize);
        ArrayList splits = new ArrayList(numSplits);
        NetworkTopology clusterMap = new NetworkTopology();
        FileStatus[] arr$ = files;
        int len$1 = files.length;

        for(int i$ = 0; i$ < len$1; ++i$) {
            FileStatus file1 = arr$[i$];
            Path path = file1.getPath();
            FileSystem fs = path.getFileSystem(job);
            long length = file1.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file1, 0L, length);
            if(length != 0L && this.isSplitable(fs, path)) {
                long var28 = file1.getBlockSize();
                long splitSize = this.computeSplitSize(var29, var30, var28);

                long bytesRemaining;
                for(bytesRemaining = length; (double)bytesRemaining / (double)splitSize > 1.1D; bytesRemaining -= splitSize) {
                    String[] splitHosts1 = this.getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts1));
                }

                if(bytesRemaining != 0L) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if(length != 0L) {
                String[] splitHosts = this.getSplitHosts(blkLocations, 0L, length, clusterMap);
                splits.add(new FileSplit(path, 0L, length, splitHosts));
            } else {
                splits.add(new FileSplit(path, 0L, length, new String[0]));
            }
        }

        LOG.debug("Total # of splits: " + splits.size());
        return (InputSplit[])splits.toArray(new FileSplit[splits.size()]);
    }

    protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
        return Math.max(minSize, Math.min(goalSize, blockSize));
    }

    protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
        for(int last = 0; last < blkLocations.length; ++last) {
            if(blkLocations[last].getOffset() <= offset && offset < blkLocations[last].getOffset() + blkLocations[last].getLength()) {
                return last;
            }
        }

        BlockLocation var7 = blkLocations[blkLocations.length - 1];
        long fileLength = var7.getOffset() + var7.getLength() - 1L;
        throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
    }

    public static void setInputPaths(JobConf conf, String commaSeparatedPaths) {
        setInputPaths(conf, (Path[])StringUtils.stringToPath(getPathStrings(commaSeparatedPaths)));
    }

    public static void addInputPaths(JobConf conf, String commaSeparatedPaths) {
        String[] arr$ = getPathStrings(commaSeparatedPaths);
        int len$ = arr$.length;

        for(int i$ = 0; i$ < len$; ++i$) {
            String str = arr$[i$];
            addInputPath(conf, new Path(str));
        }

    }

    public static void setInputPaths(JobConf conf, Path... inputPaths) {
        Path path = new Path(conf.getWorkingDirectory(), inputPaths[0]);
        StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));

        for(int i = 1; i < inputPaths.length; ++i) {
            str.append(",");
            path = new Path(conf.getWorkingDirectory(), inputPaths[i]);
            str.append(StringUtils.escapeString(path.toString()));
        }

        conf.set("mapred.input.dir", str.toString());
    }

    public static void addInputPath(JobConf conf, Path path) {
        path = new Path(conf.getWorkingDirectory(), path);
        String dirStr = StringUtils.escapeString(path.toString());
        String dirs = conf.get("mapred.input.dir");
        conf.set("mapred.input.dir", dirs == null?dirStr:dirs + "," + dirStr);
    }

    private static String[] getPathStrings(String commaSeparatedPaths) {
        int length = commaSeparatedPaths.length();
        int curlyOpen = 0;
        int pathStart = 0;
        boolean globPattern = false;
        ArrayList pathStrings = new ArrayList();

        for(int i = 0; i < length; ++i) {
            char ch = commaSeparatedPaths.charAt(i);
            switch(ch) {
            case ',':
                if(!globPattern) {
                    pathStrings.add(commaSeparatedPaths.substring(pathStart, i));
                    pathStart = i + 1;
                }
                break;
            case '{':
                ++curlyOpen;
                if(!globPattern) {
                    globPattern = true;
                }
                break;
            case '}':
                --curlyOpen;
                if(curlyOpen == 0 && globPattern) {
                    globPattern = false;
                }
            }
        }

        pathStrings.add(commaSeparatedPaths.substring(pathStart, length));
        return (String[])pathStrings.toArray(new String[0]);
    }

    public static Path[] getInputPaths(JobConf conf) {
        String dirs = conf.get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);
        Path[] result = new Path[list.length];

        for(int i = 0; i < list.length; ++i) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }

        return result;
    }

    private void sortInDescendingOrder(List<FileInputFormat.NodeInfo> mylist) {
        Collections.sort(mylist, new Comparator() {
            public int compare(FileInputFormat.NodeInfo obj1, FileInputFormat.NodeInfo obj2) {
                return obj1 != null && obj2 != null?(obj1.getValue() == obj2.getValue()?0:(obj1.getValue() < obj2.getValue()?1:-1)):-1;
            }
        });
    }

    protected String[] getSplitHosts(BlockLocation[] blkLocations, long offset, long splitSize, NetworkTopology clusterMap) throws IOException {
        int startIndex = this.getBlockIndex(blkLocations, offset);
        long bytesInThisBlock = blkLocations[startIndex].getOffset() + blkLocations[startIndex].getLength() - offset;
        if(bytesInThisBlock >= splitSize) {
            return blkLocations[startIndex].getHosts();
        } else {
            long bytesInFirstBlock = bytesInThisBlock;
            int index = startIndex + 1;

            for(splitSize -= bytesInThisBlock; splitSize > 0L; splitSize -= bytesInThisBlock) {
                bytesInThisBlock = Math.min(splitSize, blkLocations[index++].getLength());
            }

            long bytesInLastBlock = bytesInThisBlock;
            int endIndex = index - 1;
            IdentityHashMap hostsMap = new IdentityHashMap();
            IdentityHashMap racksMap = new IdentityHashMap();
            String[] allTopos = new String[0];

            for(index = startIndex; index <= endIndex; ++index) {
                if(index == startIndex) {
                    bytesInThisBlock = bytesInFirstBlock;
                } else if(index == endIndex) {
                    bytesInThisBlock = bytesInLastBlock;
                } else {
                    bytesInThisBlock = blkLocations[index].getLength();
                }

                allTopos = blkLocations[index].getTopologyPaths();
                if(allTopos.length == 0) {
                    allTopos = this.fakeRacks(blkLocations, index);
                }

                String[] arr$ = allTopos;
                int len$ = allTopos.length;

                for(int i$ = 0; i$ < len$; ++i$) {
                    String topo = arr$[i$];
                    Object node = clusterMap.getNode(topo);
                    if(node == null) {
                        node = new NodeBase(topo);
                        clusterMap.add((Node)node);
                    }

                    FileInputFormat.NodeInfo nodeInfo = (FileInputFormat.NodeInfo)hostsMap.get(node);
                    Node parentNode;
                    FileInputFormat.NodeInfo parentNodeInfo;
                    if(nodeInfo == null) {
                        nodeInfo = new FileInputFormat.NodeInfo((Node)node);
                        hostsMap.put(node, nodeInfo);
                        parentNode = ((Node)node).getParent();
                        parentNodeInfo = (FileInputFormat.NodeInfo)racksMap.get(parentNode);
                        if(parentNodeInfo == null) {
                            parentNodeInfo = new FileInputFormat.NodeInfo(parentNode);
                            racksMap.put(parentNode, parentNodeInfo);
                        }

                        parentNodeInfo.addLeaf(nodeInfo);
                    } else {
                        nodeInfo = (FileInputFormat.NodeInfo)hostsMap.get(node);
                        parentNode = ((Node)node).getParent();
                        parentNodeInfo = (FileInputFormat.NodeInfo)racksMap.get(parentNode);
                    }

                    nodeInfo.addValue(index, bytesInThisBlock);
                    parentNodeInfo.addValue(index, bytesInThisBlock);
                }
            }

            return this.identifyHosts(allTopos.length, racksMap);
        }
    }

    private String[] identifyHosts(int replicationFactor, Map<Node, FileInputFormat.NodeInfo> racksMap) {
        String[] retVal = new String[replicationFactor];
        LinkedList rackList = new LinkedList();
        rackList.addAll(racksMap.values());
        this.sortInDescendingOrder(rackList);
        boolean done = false;
        int index = 0;
        Iterator i$ = rackList.iterator();

        while(i$.hasNext()) {
            FileInputFormat.NodeInfo ni = (FileInputFormat.NodeInfo)i$.next();
            Set hostSet = ni.getLeaves();
            LinkedList hostList = new LinkedList();
            hostList.addAll(hostSet);
            this.sortInDescendingOrder(hostList);
            Iterator i$1 = hostList.iterator();

            while(i$1.hasNext()) {
                FileInputFormat.NodeInfo host = (FileInputFormat.NodeInfo)i$1.next();
                retVal[index++] = host.node.getName().split(":")[0];
                if(index == replicationFactor) {
                    done = true;
                    break;
                }
            }

            if(done) {
                break;
            }
        }

        return retVal;
    }

    private String[] fakeRacks(BlockLocation[] blkLocations, int index) throws IOException {
        String[] allHosts = blkLocations[index].getHosts();
        String[] allTopos = new String[allHosts.length];

        for(int i = 0; i < allHosts.length; ++i) {
            allTopos[i] = "/default-rack/" + allHosts[i];
        }

        return allTopos;
    }

    private static class NodeInfo {
        final Node node;
        final Set<Integer> blockIds;
        final Set<FileInputFormat.NodeInfo> leaves;
        private long value;

        NodeInfo(Node node) {
            this.node = node;
            this.blockIds = new HashSet();
            this.leaves = new HashSet();
        }

        long getValue() {
            return this.value;
        }

        void addValue(int blockIndex, long value) {
            if(this.blockIds.add(Integer.valueOf(blockIndex))) {
                this.value += value;
            }

        }

        Set<FileInputFormat.NodeInfo> getLeaves() {
            return this.leaves;
        }

        void addLeaf(FileInputFormat.NodeInfo nodeInfo) {
            this.leaves.add(nodeInfo);
        }
    }

    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            Iterator i$ = this.filters.iterator();

            PathFilter filter;
            do {
                if(!i$.hasNext()) {
                    return true;
                }

                filter = (PathFilter)i$.next();
            } while(filter.accept(path));

            return false;
        }
    }

    public static enum Counter {
        BYTES_READ;

        private Counter() {
        }
    }
}
</span>

 
比如我们需要定制每个spit的大小,我们且看FileinputFormat是如何定义分割大小的
    protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
        return Math.max(minSize, Math.min(goalSize, blockSize));
    }
其实split的大小可以超过block的(这样会导致跨block读取导致网络堵塞)
 
 
如果你需要定制1024b的分割量,修改即可

杨过悔

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
inputFormat,RecordRead的理解和定制

先看看InputFormat接口和public interface InputFormat { InputSplit[] getSplits(JobConf var1, int var2) throws IOException; RecordReader getRecordReader(InputSplit var1, JobConf var2, Reporter va
复制链接

扫一扫