Reading files in complex formats has long been a problem for Hadoop: apart from SequenceFile and TextFile, no other file types were supported. I have rewritten the FileInputFormat and RecordReader so that, in addition to XML, they can also handle HTML, images (including complex binary formats), and more. The implementation is split into four files.
FileID.java
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pack;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
* The class represents a document id, which is of type text.
*/
/**
 * Wraps a document identifier (the file path) as a {@link Text} so it can be
 * used as a MapReduce key. Instances are created fresh by the record reader
 * for every record, so {@code write}/{@code readFields} are deliberately
 * unsupported: this key is never meant to be serialized by the framework.
 */
public class FileID implements WritableComparable {
  private final Text docID;

  /**
   * Constructor.
   *
   * @param temp the document id text; defensively copied, so later mutation
   *             of the argument does not affect this key
   */
  public FileID(Text temp) {
    docID = new Text(temp);
  }

  /**
   * The text of the document id.
   * @return the text
   */
  public Text getText() {
    return docID;
  }

  /* (non-Javadoc)
   * @see java.lang.Comparable#compareTo(java.lang.Object)
   */
  public int compareTo(Object obj) {
    if (this == obj) {
      return 0;
    } else {
      return docID.compareTo(((FileID) obj).docID);
    }
  }

  /* (non-Javadoc)
   * @see java.lang.Object#equals(java.lang.Object)
   */
  @Override
  public boolean equals(Object obj) {
    // Added so equals is consistent with hashCode and compareTo: two FileIDs
    // are equal iff their docID texts are equal.
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof FileID)) {
      return false;
    }
    return docID.equals(((FileID) obj).docID);
  }

  /* (non-Javadoc)
   * @see java.lang.Object#hashCode()
   */
  @Override
  public int hashCode() {
    return docID.hashCode();
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    return this.getClass().getName() + "[" + docID + "]";
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    // Keys of this type must never be serialized by the framework.
    throw new IOException(this.getClass().getName()
        + ".write should never be called");
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
   */
  public void readFields(DataInput in) throws IOException {
    // Keys of this type must never be deserialized by the framework.
    throw new IOException(this.getClass().getName()
        + ".readFields should never be called");
  }
}
MyFileStatus.java
package org.pack;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
/**
 * A {@link FileStatus} that additionally carries an open input stream for the
 * file it describes. The record reader hands the whole file to the consumer
 * through this stream rather than copying its bytes.
 *
 * NOTE(review): {@code fsInputStream} was intentionally made public (per the
 * original author's comment); callers elsewhere may access it directly, so it
 * must stay public.
 */
public class MyFileStatus extends FileStatus {

  /** Open stream positioned at the start of the file; set by the record reader. */
  public FSDataInputStream fsInputStream;

  /** Returns the stream previously attached via {@link #setInputStream}. */
  public FSDataInputStream getInputStream() {
    return this.fsInputStream;
  }

  /** Attaches the open stream for this file. */
  public void setInputStream(FSDataInputStream in) {
    this.fsInputStream = in;
  }
}
XMLFileInputFormat.java
package org.pack;
import java.io.*;
import java.util.ArrayList;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
 * An input format that delivers each input file as a single record: the key is
 * the file's path ({@link FileID}) and the value is a {@link MyFileStatus}
 * carrying an open stream over the whole file. Because the record is the whole
 * file, inputs are never split.
 */
public class XMLFileInputFormat extends FileInputFormat<FileID, MyFileStatus>
  implements JobConfigurable {
  private CompressionCodecFactory compressionCodecs = null;

  /** Initializes the codec factory from the job configuration. */
  public void configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
  }

  /** Whole-file records: a file must never be split across readers. */
  protected boolean isSplitable(FileSystem fs, Path file) {
    return false;
  }

  /**
   * Builds exactly one split per input file returned by
   * {@link #listStatus(JobConf)}.
   *
   * @param job the job configuration
   * @param numSplits hint only; the actual count is the number of input files
   * @return one {@link FileSplit} covering each whole file
   * @throws IOException if an input path is a directory or the filesystem fails
   */
  public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
    FileStatus[] files = listStatus(job);
    // Validate inputs first: directories cannot be read as whole-file records.
    for (FileStatus file: files) {
      if (file.isDir()) {
        throw new IOException("Not a file: "+ file.getPath());
      }
    }
    // generate splits: one per file
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(files.length);
    for (FileStatus file: files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job);
      long length = file.getLen();
      if (length != 0) {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        // Guard against an empty location array (e.g. exotic filesystems)
        // instead of indexing blindly into blkLocations[0].
        String[] hosts = (blkLocations.length > 0)
            ? blkLocations[0].getHosts() : new String[0];
        splits.add(new FileSplit(path, 0, length, hosts));
      } else {
        // Create empty hosts array for zero length files; no block lookup needed.
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
  }

  /** Returns a reader that emits the split's file as one key/value record. */
  public RecordReader<FileID, MyFileStatus> getRecordReader(
                                          InputSplit split,
                                          JobConf job,
                                          Reporter reporter)
    throws IOException{
    reporter.setStatus(split.toString());
    return new XMLFileRecordReader(job, ((FileSplit) split).getPath());
  }
}
XMLFileRecordReader.java
package org.pack;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
/**
 * A record reader that emits exactly one record per split: the key is the
 * file's path and the value carries an open {@link FSDataInputStream} over
 * the whole file. After the single record is consumed, {@link #next} returns
 * false.
 */
public class XMLFileRecordReader implements RecordReader<FileID, MyFileStatus> {
  // Non-null until the single record has been handed out; doubles as the
  // "already consumed" flag.
  private Path filePath;
  private FSDataInputStream fsInputStream;
  // Path as Text, used to build keys; public for compatibility with existing callers.
  public Text temp;

  /**
   * Opens the file so its stream can be handed to the consumer via the value.
   *
   * @param job the job configuration used to resolve the filesystem
   * @param p the file this reader serves
   * @throws IOException if the file cannot be opened
   */
  public XMLFileRecordReader(Configuration job, Path p) throws IOException{
    filePath = p;
    temp = new Text(p.toString());
    FileSystem fs = filePath.getFileSystem(job);
    fsInputStream = fs.open(filePath);
  }

  /**
   * Reads the next key/value pair from the input for processing.
   *
   * @param k the key to read data into (unused: keys are built by createKey)
   * @param value the value to read data into
   * @return true iff a key/value was read, false if at EOF
   */
  public boolean next(FileID k, MyFileStatus value) throws IOException{
    if (filePath != null) {
      value.setInputStream(fsInputStream);
      filePath = null; // mark the single record as consumed
      return true;
    }
    return false;
  }

  /**
   * Create an object of the appropriate type to be used as a key.
   *
   * @return a new key object.
   */
  public FileID createKey(){
    return new FileID(temp);
  }

  /**
   * Create an object of the appropriate type to be used as a value.
   *
   * @return a new value object.
   */
  public MyFileStatus createValue(){
    return new MyFileStatus();
  }

  /**
   * Returns the current position in the input.
   *
   * @return always 0; position is not tracked for whole-file records.
   * @throws IOException
   */
  public long getPos() throws IOException{
    return 0;
  }

  /**
   * Close this reader and release the underlying stream.
   *
   * Fix: the original implementation leaked the open stream. The framework
   * calls close() only after the consumer has finished with the record, so
   * closing here is safe.
   *
   * @throws IOException if closing the stream fails
   */
  public void close() throws IOException{
    if (fsInputStream != null) {
      fsInputStream.close();
      fsInputStream = null;
    }
  }

  /**
   * How much of the input has the {@link RecordReader} consumed i.e.
   * has been processed by?
   *
   * @return 0.0 while the single record is pending, 1.0 once consumed.
   * @throws IOException
   */
  public float getProgress() throws IOException{
    return (filePath == null) ? 1.0f : 0.0f;
  }
}