LazyObject
LazyObject 用一个字节数组和范围代表一个对象。可以代表任何对象。
public class LazyObject {
protected byte[] bytes;
protected int start;
protected int length;
protected LazyObject() {
bytes = null;
start = 0;
length = 0;
}
protected LazyObject(byte[] bytes, int start, int length) {
setAll(bytes, start, length);
}
protected void setAll(byte[] bytes, int start, int length) {
this.bytes = bytes;
this.start = start;
this.length = length;
}
}
LazyPrimitive
LazyPrimitive 是基本类型的通用 lazy 实现类,增加了 primitiveClass 字段。
/**
 * Common base for lazy wrappers around primitive values. On top of the byte
 * range inherited from LazyObject it records the Java class of the value the
 * wrapper produces.
 *
 * @param <T> the Java type returned by {@link #getPrimitiveObject()}
 */
public abstract class LazyPrimitive<T> extends LazyObject {

  /** Class object for the value type {@code T}. */
  Class<T> primitiveClass;

  /**
   * @param primitiveClass class object for the value type of this wrapper
   */
  protected LazyPrimitive(Class<T> primitiveClass) {
    super();
    this.primitiveClass = primitiveClass;
  }

  /**
   * Returns the actual primitive object represented by this LazyObject.
   *
   * @return the value as an instance of {@code T}
   */
  public abstract T getPrimitiveObject();
}
LazyByte, LazyLong, LazyInteger, LazyString, LazyDouble, LazyShort
LazyByte 提供了 Byte 的 lazy 实现,仅当调用 getPrimitiveObject 时,才反序列化。
LazyLong、LazyInteger、LazyString、LazyDouble、LazyShort 的实现也都类似。
/**
 * Lazy wrapper for a {@link Byte} value. The byte range is only parsed when
 * {@link #getPrimitiveObject()} is called.
 */
public class LazyByte extends LazyPrimitive<Byte> {

  public LazyByte() {
    super(Byte.class);
  }

  /**
   * Parses the current byte range into a {@link Byte}.
   *
   * @return the parsed value, or {@code null} when no backing array is set or
   *         when parsing throws a {@link NumberFormatException}
   */
  @Override
  public Byte getPrimitiveObject() {
    if (bytes == null) {
      return null;
    }
    Byte value;
    try {
      // Parse straight from the byte range; going through an intermediate
      // String first was noted as the slower alternative.
      value = Byte.valueOf(parseByte(bytes, start, length));
    } catch (NumberFormatException ignored) {
      // Unparseable or out-of-range data is reported to the caller as null.
      value = null;
    }
    return value;
  }

  /**
   * Parses a byte range as a base-10 byte value.
   *
   * @param bytes  array containing the text to parse (a UTF-8 encoded string
   *               representation of a single byte quantity)
   * @param start  offset of the first byte to parse
   * @param length number of bytes to parse
   * @return the byte value represented by the range
   * @throws NumberFormatException
   *           if the range could not be parsed as a byte quantity
   */
  public static byte parseByte(byte[] bytes, int start, int length) throws NumberFormatException {
    return parseByte(bytes, start, length, 10);
  }

  /**
   * Parses a byte range as a byte value in the given radix. The range is
   * first parsed as an int by {@code LazyInteger.parseInt} and then checked
   * to fit in a byte.
   *
   * @param bytes  array containing the text to parse (a UTF-8 encoded string
   *               representation of a single byte quantity)
   * @param start  offset of the first byte to parse
   * @param length number of bytes to parse
   * @param radix  the radix to use when parsing
   * @return the byte value represented by the range
   * @throws NumberFormatException
   *           if the range could not be parsed, or the parsed int does not
   *           fit in a byte
   */
  public static byte parseByte(byte[] bytes, int start, int length, int radix)
      throws NumberFormatException {
    int parsed = LazyInteger.parseInt(bytes, start, length, radix);
    if (parsed < Byte.MIN_VALUE || parsed > Byte.MAX_VALUE) {
      throw new NumberFormatException();
    }
    return (byte) parsed;
  }
}
LazyStruct
/**
 * A LazyObject representing a row whose fields are stored back to back in the
 * byte range, delimited by a single separator byte.
 *
 * <p>The row is split into fields on the first {@link #getField(int)} call
 * after the row data was (re)set; the split only assigns a byte range to each
 * field object, it does not decode the field values.
 */
public class LazyStruct extends LazyObject {

  private static final Log LOG = LogFactory.getLog(LazyStruct.class.getName());

  /** Field objects, reused for every row; parse() re-points them at the current row. */
  LazyObject[] fields;
  /** fieldIsPrimitive[i] is true iff fields[i] is a LazyPrimitive. */
  boolean[] fieldIsPrimitive;
  /** Byte that delimits fields within the row. */
  byte separator;
  /** Byte sequence that marks a field as null. */
  Text nullSequence;
  /** Whether the last field absorbs all remaining bytes, separators included. */
  boolean lastColumnTakesAll;
  /** Whether the current row data has already been split into fields. */
  boolean parsed;
  /** Set once the "missing fields" warning has been logged by this instance. */
  boolean missingFieldWarned = false;
  /** Set once the "extra bytes" warning has been logged by this instance. */
  boolean extraFieldWarned = false;

  /**
   * Create a new LazyStruct Object.
   *
   * @param fields The field LazyObjects
   * @param separator The separator for delimiting the fields in the byte[]
   * @param nullSequence The sequence for null value
   * @param lastColumnTakesAll whether the additional fields should be all put into the last column
   *                           in case the data contains more columns than the schema.
   */
  public LazyStruct(LazyObject[] fields, byte separator,
      Text nullSequence, boolean lastColumnTakesAll) {
    this.fields = fields;
    this.separator = separator;
    this.nullSequence = nullSequence;
    this.lastColumnTakesAll = lastColumnTakesAll;
    parsed = false;
    // Classify each field once so getField() does not need instanceof per call.
    fieldIsPrimitive = new boolean[fields.length];
    for (int i = 0; i < fields.length; i++) {
      fieldIsPrimitive[i] = (fields[i] instanceof LazyPrimitive);
    }
  }

  /**
   * Set the row data for this LazyStruct. Replaces LazyObject's setAll so
   * that the next getField() call re-parses the new row.
   *
   * @param bytes  array containing the row
   * @param start  offset of the first byte of the row
   * @param length number of bytes in the row
   */
  protected void setAll(byte[] bytes, int start, int length) {
    super.setAll(bytes, start, length);
    parsed = false;
  }

  /**
   * Parse the byte[] and fill each field.
   *
   * <p>Fields equal to the null sequence, and fields for which the row has no
   * data, are reset with a null byte array. A warning is logged the first
   * time this instance sees a row with too few fields, and the first time it
   * sees a row with bytes left over after the last field.
   */
  private void parse() {
    int structByteEnd = start + length;
    int fieldId = 0;
    int fieldByteBegin = start;
    int fieldByteEnd = start;

    // Go through all bytes in the byte[]. The loop runs up to and including
    // structByteEnd so that the end of the row also terminates a field.
    while (fieldByteEnd <= structByteEnd) {
      if (fieldByteEnd == structByteEnd || bytes[fieldByteEnd] == separator) {
        // end of field reached
        if (lastColumnTakesAll && fieldId == fields.length - 1) {
          // The last field swallows everything that is left in the row.
          fieldByteEnd = structByteEnd;
        }
        // Test the length first so in most cases we avoid doing a byte[] comparison.
        int fieldLength = fieldByteEnd - fieldByteBegin;
        if (fieldLength == nullSequence.getLength()
            && LazyUtils.compare(bytes, fieldByteBegin, fieldLength,
                nullSequence.getBytes(), 0, nullSequence.getLength()) == 0) {
          fields[fieldId].setAll(null, 0, 0);
        } else {
          fields[fieldId].setAll(bytes, fieldByteBegin, fieldLength);
        }
        fieldId++;
        if (fieldId == fields.length || fieldByteEnd == structByteEnd) {
          // all fields have been parsed, or all bytes have been parsed
          break;
        }
        // The next field starts right after the separator.
        fieldByteBegin = fieldByteEnd + 1;
      }
      fieldByteEnd++;
    }

    // Extra bytes at the end?
    if (!extraFieldWarned && fieldByteEnd < structByteEnd) {
      extraFieldWarned = true;
      LOG.warn("Extra bytes detected at the end of the row! Ignoring similar problems.");
    }

    // Missing fields?
    if (!missingFieldWarned && fieldId < fields.length) {
      missingFieldWarned = true;
      LOG.warn("Missing fields! Expected " + fields.length + " fields but only got "
          + fieldId + "! Ignoring similar problems.");
    }

    // Fill all missing fields with nulls.
    for (; fieldId < fields.length; fieldId++) {
      fields[fieldId].setAll(null, 0, 0);
    }

    parsed = true;
  }

  /**
   * Get one field out of the struct, parsing the row first if needed.
   *
   * If the field is a primitive field, return the actual object.
   * Otherwise return the LazyObject. This is because PrimitiveObjectInspector
   * does not have control over the object used by the user - the user simply
   * directly use the Object instead of going through
   * Object PrimitiveObjectInspector.get(Object).
   *
   * @param i the field ID
   * @return for a primitive field, the result of its getPrimitiveObject();
   *         otherwise the field's LazyObject
   */
  public Object getField(int i) {
    if (!parsed) {
      parse();
    }
    if (fieldIsPrimitive[i]) {
      // Wildcard instead of a raw type: the element type is not needed here.
      return ((LazyPrimitive<?>) fields[i]).getPrimitiveObject();
    }
    return fields[i];
  }
}