MapReduce编程模型之InputFormat分析(二)

最新推荐文章于 2021-03-04 14:33:49 发布

OnlyOne_2014

最新推荐文章于 2021-03-04 14:33:49 发布

阅读量486

点赞数

分类专栏： Hadoop

本文链接：https://blog.csdn.net/OnlyOne_2014/article/details/40186535

版权

Hadoop 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

所有基于文件(也就是基于HDFS)的InputFormat实现,其基类都是FileInputFormat。FileInputFormat统一实现了getSplits方法,而各个具体实现类则各自实现自己的RecordReader,用于从分片中解析出键值对。

InputFormat抽象类源代码

<span style="font-size:14px;color:#000000;">public abstract class InputFormat<K, V> {

  /** 
   * Logically split the set of input files for the job.  
   * 
   * <p>Each {@link InputSplit} is then assigned to an individual {@link Mapper}
   * for processing.</p>
   *
   * <p><i>Note</i>: The split is a <i>logical</i> split of the inputs and the
   * input files are not physically split into chunks. For e.g. a split could
   * be <i><input-file-path, start, offset></i> tuple. The InputFormat
   * also creates the {@link RecordReader} to read the {@link InputSplit}.
   * 
   * @param context job configuration.
   * @return an array of {@link InputSplit}s for the job.
   */
  public abstract 
    List<InputSplit> getSplits(JobContext context
                               ) throws IOException, InterruptedException;
  
  /**
   * Create a record reader for a given split. The framework will call
   * {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
   * the split is used.
   * @param split the split to be read
   * @param context the information about the task
   * @return a new record reader
   * @throws IOException
   * @throws InterruptedException
   */
  public abstract 
    RecordReader<K,V> createRecordReader(InputSplit split,
                                         TaskAttemptContext context
                                        ) throws IOException, 
                                                 InterruptedException;

}</span>

FileInputFormat的getSplits方法解析

<span style="font-size:14px;color:#000000;">public List<InputSplit> getSplits(JobContext job
                                    ) throws IOException {

 
//计算文件切分的最小值,由配置参数mapred.min.split.size确定,默认是1字节,其设定的值必须比1大
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
//计算文件切分的最大值,由配置参数mapred.max.split.size确定
    long maxSize = getMaxSplitSize(job);

    // generate splits 生成InputSplit
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus>files = listStatus(job);
    for (FileStatus file: files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) { 
</span><pre name="code" class="java"><span style="font-size:14px;color:#000000;">//文件切分算法,用于确定InputSplit的个数和每个InputSplit对应的数据段.
//计算InputSplit的方法是 splitSize=max{minSize,min{maxSize,blockSize}}</span>
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

//host选择算法,确定每个InputSplit的元数据信息,这通常由四部分组成<file,start,length,hosts>表示InputSplit
//所在的文件,起始位置,长度和host节点列表

 long bytesRemaining = length;
        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
//计算该分片所在块的块索引
          int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
          splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 
                                   blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }
        
        if (bytesRemaining != 0) {
          splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining, 
                     blkLocations[blkLocations.length-1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else { 
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }

在Host选择算法中,每个InputSplit的host列表只取自其起始偏移量所在的那个Block;因此当InputSplit尺寸大于Block尺寸时,MapTask并不能实现完全的数据本地性,也就是说,总有一部分数据需要从远程节点获取。

自定义InputFormat---ComputeIntensiveTextInputFormat实现

基于HDFS的自定义InputFormat一般扩展FileInputFormat类,然后自己实现RecordReader,用于从分片数据中解析出KV键值对。

这个例子主要展示如何编写自定义的InputFormat。该InputFormat直接继承已实现的TextInputFormat,只覆盖getSplits方法,展示如何为计算密集型应用分配host:不考虑数据的本地性,而是考虑计算的均匀性,将map任务均衡地分配给各个节点。

<span style="font-size:14px;color:#000000;">public class ComputeIntensiveTextInputFormat extends TextInputFormat{

	private static final double SPLIT_SLOP = 1.1; 
	static final String NUM_INPUT_FILES = "mapreduce.input.num.files";
	@Override
	public List<InputSplit> getSplits(JobContext arg0) throws IOException {
		// TODO Auto-generated method stub
		long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(arg0));
	    long maxSize = getMaxSplitSize(arg0);
	    
	    //这里获取集群可用服务器列表,然后为每个输入分片均匀分配服务器
		String[] servers=getActiveServerList(arg0);
		if(servers==null)
			return null;
		List<InputSplit> splits=new ArrayList<InputSplit>();
		List<FileStatus> files=listStatus(arg0);
		int currentServer=0;
		for(FileStatus file:files){
			Path path=file.getPath();
			long length=file.getLen();
			if(length!=0&&isSplitable(arg0, path)){
				long blockSize=file.getBlockSize();
				long splitSize=computeSplitSize(blockSize, minSize, maxSize);
				
				long bytesRemaining=length;
				while((double)bytesRemaining/splitSize>SPLIT_SLOP){
					splits.add(new FileSplit(path, length-bytesRemaining, splitSize, new String[]{servers[currentServer]}));
					currentServer=getNextServer(currentServer, servers.length);
					bytesRemaining-=splitSize;
				}
			}else if(length!=0){
				splits.add(new FileSplit(path, 0, length, new String[]{servers[currentServer]}));
				currentServer=getNextServer(currentServer, servers.length);
			}else{
				splits.add(new FileSplit(path, 0, length, new String[0]));
			}
		}
		
		arg0.getConfiguration().setLong(NUM_INPUT_FILES,splits.size());
		return splits;
	}

	//获取集群中可用服务器的列表
	private String[] getActiveServerList(JobContext context){
		String[] servers=null;
		
		try{
			JobClient jc=new JobClient((JobConf) context.getConfiguration());
			ClusterStatus status=jc.getClusterStatus(true);
			Collection<String> atc=status.getActiveTrackerNames();
			servers=new String[atc.size()];
			int s=0;
			for(String serverInfo:atc){
				StringTokenizer st=new StringTokenizer(serverInfo,":");
				String trackerName=st.nextToken();
				StringTokenizer st1=new StringTokenizer(trackerName, "_");
				st1.nextToken();
				servers[s++]=st1.nextToken();
			}
		}catch (IOException e) {
			// TODO: handle exception
			e.printStackTrace();
		}
		return servers;
	}
	
	private static int getNextServer(int current,int max){
		current++;
		if(current>=max)
			current=0;
		return current;
	}
	
}</span>

自定义InputFormat---FileListInputFormat实现
自定义InputSplit

public class MultiFileSplit extends InputSplit implements Writable{		
	private long length;
	private String[] hosts;
	private List<String> files=null;
	public void readFields(DataInput arg0) throws IOException {
		// TODO Auto-generated method stub
		length=arg0.readLong();
		hosts=new String[arg0.readInt()];
		for(int i=0;i<hosts.length;i++)
			hosts[i]=arg0.readUTF();
		int length=arg0.readInt();
		files=new ArrayList<String>();
		for(int i=0;i<length;i++)
			files.add(arg0.readUTF());
		
	}
	public void write(DataOutput arg0) throws IOException {
		// TODO Auto-generated method stub
		arg0.writeLong(length);
		arg0.writeInt(hosts.length);
		for(String host:hosts)
			arg0.writeUTF(host);
	    arg0.write(files.size());
	    for(String file:files)
	    	arg0.writeUTF(file);
	}
	@Override
	public long getLength() throws IOException, InterruptedException {
		// TODO Auto-generated method stub
		return length;
	}
	@Override
	public String[] getLocations() throws IOException, InterruptedException {
		// TODO Auto-generated method stub
		return hosts;
	}
	public MultiFileSplit(long length, String[] hosts) {
		super();
		this.length = length;
		this.hosts = hosts;
	}
	public MultiFileSplit() {
	
	}
        public void addFile(Path path){
               files.add(path.toString());
        }
        public List<String> getFiles(){
        return files;
    }
 }

自定义FileListInputFormat实现

<span style="font-size:14px;">public class FileListInputFormat extends FileInputFormat<Text, Text>{

	
	
	private static final String MAPCOUNT="map.reduce.map.count";
	private static final String INPUTPATH="mapred.input.dir";
	
	@Override
	public List<InputSplit> getSplits(JobContext arg0) throws IOException {
		// TODO Auto-generated method stub
		Configuration conf=arg0.getConfiguration();
		String fileName=conf.get(INPUTPATH, "");
		String[] hosts=getActiveServerList(arg0);
		Path p=new Path(StringUtils.unEscapeString(fileName));
		List<InputSplit> splits=new LinkedList<InputSplit>();
		FileSystem fs=p.getFileSystem(conf);
		int mappers=0;
		
		mappers=conf.getInt(MAPCOUNT,0);
		if(mappers==0)
			throw new IOException("Number of mappers is not specified");
		FileStatus[] files=fs.globStatus(p);
		int nfiles=files.length;
		if(nfiles<mappers)
			mappers=nfiles;
		for(int i=0;i<mappers;i++)
			splits.add(new MultiFileSplit(0, hosts));
		Iterator<InputSplit> siter=splits.iterator();
		for(FileStatus f:files){
			if(!siter.hasNext())
				siter=splits.iterator();
			((MultiFileSplit)(siter.next())).addFile(f.getPath().toUri().getPath());
		}		
		return splits;
	}

	@Override
	public RecordReader<Text, Text> createRecordReader(InputSplit arg0,
			TaskAttemptContext arg1) throws IOException, InterruptedException {
		// TODO Auto-generated method stub
		return null;
	}
	
	@SuppressWarnings("unused")
	private static void setMapCount(Job job,int mappers){
		Configuration conf=job.getConfiguration();
		conf.setInt(MAPCOUNT, mappers);
	}
	
	//获取集群中可用服务器的列表
	private String[] getActiveServerList(JobContext context){
		String[] servers=null;
		
		try{
			JobClient jc=new JobClient((JobConf) context.getConfiguration());
			ClusterStatus status=jc.getClusterStatus(true);
			Collection<String> atc=status.getActiveTrackerNames();
			servers=new String[atc.size()];
			int s=0;
			for(String serverInfo:atc){
				StringTokenizer st=new StringTokenizer(serverInfo,":");
				String trackerName=st.nextToken();
				StringTokenizer st1=new StringTokenizer(trackerName, "_");
				st1.nextToken();
				servers[s++]=st1.nextToken();
			}
		}catch (IOException e) {
			// TODO: handle exception
			e.printStackTrace();
		}
		return servers;
	}

}</span>

参考书籍:

Hadoop技术内幕深入理解MapReduce架构设计与实现原理

Hadoop高级编程---构建与实现大数据解决方案

Hadoop权威指南第2版

Hadoop实战