Reading files in complex formats has long been a problem for Hadoop: apart from SequenceFile and TextFile, no other file types were supported. I have rewritten the FileInputFormat and RecordReader so that, in addition to XML, they can also handle HTML, images (including complex binary formats), and more. The implementation is split into four files.
FileID.java
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pack;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
* The class represents a document id, which is of type text.
*/
/**
 * Wraps a document identifier (the file path) as a {@link Text} so it can be
 * used as a MapReduce key. Instances are created fresh by the record reader
 * for every record, so {@code write}/{@code readFields} are deliberately
 * unsupported: this key is never meant to be serialized by the framework.
 */
public class FileID implements WritableComparable {
  private final Text docID;

  /**
   * Constructor.
   *
   * @param temp the document id text; defensively copied, so later mutation
   *             of the argument does not affect this key
   */
  public FileID(Text temp) {
    docID = new Text(temp);
  }

  /**
   * The text of the document id.
   * @return the text
   */
  public Text getText() {
    return docID;
  }

  /* (non-Javadoc)
   * @see java.lang.Comparable#compareTo(java.lang.Object)
   */
  public int compareTo(Object obj) {
    if (this == obj) {
      return 0;
    } else {
      return docID.compareTo(((FileID) obj).docID);
    }
  }

  /* (non-Javadoc)
   * @see java.lang.Object#equals(java.lang.Object)
   */
  @Override
  public boolean equals(Object obj) {
    // Added so equals is consistent with hashCode and compareTo: two FileIDs
    // are equal iff their docID texts are equal.
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof FileID)) {
      return false;
    }
    return docID.equals(((FileID) obj).docID);
  }

  /* (non-Javadoc)
   * @see java.lang.Object#hashCode()
   */
  @Override
  public int hashCode() {
    return docID.hashCode();
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    return this.getClass().getName() + "[" + docID + "]";
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    // Keys of this type must never be serialized by the framework.
    throw new IOException(this.getClass().getName()
        + ".write should never be called");
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
   */
  public void readFields(DataInput in) throws IOException {
    // Keys of this type must never be deserialized by the framework.
    throw new IOException(this.getClass().getName()
        + ".readFields should never be called");
  }
}
MyFileStatus.java
package org.pack;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
/**
 * A {@link FileStatus} that additionally carries an open input stream for the
 * file it describes. The record reader hands the whole file to the consumer
 * through this stream rather than copying its bytes.
 *
 * NOTE(review): {@code fsInputStream} was intentionally made public (per the
 * original author's comment); callers elsewhere may access it directly, so it
 * must stay public.
 */
public class MyFileStatus extends FileStatus {

  /** Open stream positioned at the start of the file; set by the record reader. */
  public FSDataInputStream fsInputStream;

  /** Returns the stream previously attached via {@link #setInputStream}. */
  public FSDataInputStream getInputStream() {
    return this.fsInputStream;
  }

  /** Attaches the open stream for this file. */
  public void setInputStream(FSDataInputStream in) {
    this.fsInputStream = in;
  }
}
XMLFileInputFormat.java
package org.pack;
import java.io.*;
import java.util.ArrayList;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
 * An input format that delivers each input file as a single record: the key is
 * the file's path ({@link FileID}) and the value is a {@link MyFileStatus}
 * carrying an open stream over the whole file. Because the record is the whole
 * file, inputs are never split.
 */
public class XMLFileInputFormat extends FileInputFormat<FileID, MyFileStatus>
  implements JobConfigurable {
  private CompressionCodecFactory compressionCodecs = null;

  /** Initializes the codec factory from the job configuration. */
  public void configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
  }

  /** Whole-file records: a file must never be split across readers. */
  protected boolean isSplitable(FileSystem fs, Path file) {
    return false;
  }

  /**
   * Builds exactly one split per input file returned by
   * {@link #listStatus(JobConf)}.
   *
   * @param job the job configuration
   * @param numSplits hint only; the actual count is the number of input files
   * @return one {@link FileSplit} covering each whole file
   * @throws IOException if an input path is a directory or the filesystem fails
   */
  public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
    FileStatus[] files = listStatus(job);
    // Validate inputs first: directories cannot be read as whole-file records.
    for (FileStatus file: files) {
      if (file.isDir()) {
        throw new IOException("Not a file: "+ file.getPath());
      }
    }
    // generate splits: one per file
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(files.length);
    for (FileStatus file: files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job);
      long length = file.getLen();
      if (length != 0) {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        // Guard against an empty location array (e.g. exotic filesystems)
        // instead of indexing blindly into blkLocations[0].
        String[] hosts = (blkLocations.length > 0)
            ? blkLocations[0].getHosts() : new String[0];
        splits.add(new FileSplit(path, 0, length, hosts));
      } else {
        // Create empty hosts array for zero length files; no block lookup needed.
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
  }

  /** Returns a reader that emits the split's file as one key/value record. */
  public RecordReader<FileID, MyFileStatus> getRecordReader(
                                          InputSplit split,
                                          JobConf job,
                                          Reporter reporter)
    throws IOException{
    reporter.setStatus(split.toString());
    return new XMLFileRecordReader(job, ((FileSplit) split).getPath());
  }
}
XMLFileRecordReader.java
package org.pack;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
/**
 * A record reader that emits exactly one record per split: the key is the
 * file's path and the value carries an open {@link FSDataInputStream} over
 * the whole file. After the single record is consumed, {@link #next} returns
 * false.
 */
public class XMLFileRecordReader implements RecordReader<FileID, MyFileStatus> {
  // Non-null until the single record has been handed out; doubles as the
  // "already consumed" flag.
  private Path filePath;
  private FSDataInputStream fsInputStream;
  // Path as Text, used to build keys; public for compatibility with existing callers.
  public Text temp;

  /**
   * Opens the file so its stream can be handed to the consumer via the value.
   *
   * @param job the job configuration used to resolve the filesystem
   * @param p the file this reader serves
   * @throws IOException if the file cannot be opened
   */
  public XMLFileRecordReader(Configuration job, Path p) throws IOException{
    filePath = p;
    temp = new Text(p.toString());
    FileSystem fs = filePath.getFileSystem(job);
    fsInputStream = fs.open(filePath);
  }

  /**
   * Reads the next key/value pair from the input for processing.
   *
   * @param k the key to read data into (unused: keys are built by createKey)
   * @param value the value to read data into
   * @return true iff a key/value was read, false if at EOF
   */
  public boolean next(FileID k, MyFileStatus value) throws IOException{
    if (filePath != null) {
      value.setInputStream(fsInputStream);
      filePath = null; // mark the single record as consumed
      return true;
    }
    return false;
  }

  /**
   * Create an object of the appropriate type to be used as a key.
   *
   * @return a new key object.
   */
  public FileID createKey(){
    return new FileID(temp);
  }

  /**
   * Create an object of the appropriate type to be used as a value.
   *
   * @return a new value object.
   */
  public MyFileStatus createValue(){
    return new MyFileStatus();
  }

  /**
   * Returns the current position in the input.
   *
   * @return always 0; position is not tracked for whole-file records.
   * @throws IOException
   */
  public long getPos() throws IOException{
    return 0;
  }

  /**
   * Close this reader and release the underlying stream.
   *
   * Fix: the original implementation leaked the open stream. The framework
   * calls close() only after the consumer has finished with the record, so
   * closing here is safe.
   *
   * @throws IOException if closing the stream fails
   */
  public void close() throws IOException{
    if (fsInputStream != null) {
      fsInputStream.close();
      fsInputStream = null;
    }
  }

  /**
   * How much of the input has the {@link RecordReader} consumed i.e.
   * has been processed by?
   *
   * @return 0.0 while the single record is pending, 1.0 once consumed.
   * @throws IOException
   */
  public float getProgress() throws IOException{
    return (filePath == null) ? 1.0f : 0.0f;
  }
}