Serialization
Serialization is the process of turning structured objects into a byte stream for transmission over a network or for storage on disk. Deserialization is the reverse process: turning a byte stream back into a series of structured objects.
Desirable properties of an RPC serialization format
1. Compact: makes efficient use of network bandwidth and storage space.
2. Fast: serialization and deserialization should perform well.
3. Extensible: the protocol can evolve to meet new requirements.
4. Interoperable: clients and servers are not tied to a particular language implementation.
Hadoop uses Writables, which are compact and fast, but not extensible or interoperable.
The Writable interface
package org.apache.hadoop.io;
import java.io.DataOutput;
import java.io.DataInput;
import java.io.IOException;
public interface Writable {
    // Serialize this object's fields to out.
    void write(DataOutput out) throws IOException;
    // Deserialize this object's fields from in.
    void readFields(DataInput in) throws IOException;
}
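To define a custom type, a class implements both methods, writing its fields in a fixed order and reading them back in the same order. Below is a minimal sketch (the WeatherWritable name and its two int fields are illustrative, not part of Hadoop):
package com.bigdata.io;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class WeatherWritable implements Writable {
    private int year;
    private int temperature;
    // Writables need a no-arg constructor so the framework can instantiate them reflectively.
    public WeatherWritable() {}
    public WeatherWritable(int year, int temperature) {
        this.year = year;
        this.temperature = temperature;
    }
    public int getYear() { return year; }
    public int getTemperature() { return temperature; }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);         // fields are written in a fixed order...
        out.writeInt(temperature);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        year = in.readInt();        // ...and must be read back in exactly the same order
        temperature = in.readInt();
    }
}
The helper class below serializes and deserializes any Writable through in-memory byte streams: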
package com.bigdata.io;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
public final class WritableHelper {
    // Serialize a Writable into a byte array via a DataOutputStream.
    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }
    // Populate a Writable from a byte array; returns the input bytes unchanged.
    public static byte[] deserialize(Writable writable, byte[] bytes) throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
    }
    public static void main(String[] args) throws IOException {
        IntWritable writable = new IntWritable();
        writable.set(163);
        byte[] bytes = serialize(writable);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes)); // prints "4,163": an int is 4 bytes
        deserialize(writable, bytes);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes)); // prints "4,163" again
    }
}
WritableComparable and comparators
package org.apache.hadoop.io;
public interface WritableComparable<T> extends Writable, Comparable<T> {
}
package org.apache.hadoop.io;
import java.util.Comparator;
// Compares records directly from their serialized representations,
// without deserializing them into objects first.
public interface RawComparator<T> extends Comparator<T> {
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2);
}
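A raw comparator makes sorting cheap because records are ordered straight from their bytes. A minimal sketch for the hypothetical WeatherWritable above, assuming records should be ordered by year, which occupies the first four serialized bytes:
package com.bigdata.io;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
public class WeatherRawComparator implements RawComparator<WeatherWritable> {
    // Compare the serialized year fields directly; WritableComparator.readInt
    // reads a big-endian int at the given offset, so no objects are created.
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int year1 = WritableComparator.readInt(b1, s1);
        int year2 = WritableComparator.readInt(b2, s2);
        return Integer.compare(year1, year2);
    }
    // The object-based comparison applies the same ordering.
    @Override
    public int compare(WeatherWritable w1, WeatherWritable w2) {
        return Integer.compare(w1.getYear(), w2.getYear());
    }
}
The extended WritableHelper below compares IntWritables both as objects and as raw bytes: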
package com.bigdata.io;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparator;
public final class WritableHelper {
    // Serialize a Writable into a byte array via a DataOutputStream.
    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }
    // Populate a Writable from a byte array; returns the input bytes unchanged.
    public static byte[] deserialize(Writable writable, byte[] bytes) throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
    }
    public static void main(String[] args) throws IOException {
        IntWritable writable = new IntWritable();
        writable.set(163);
        byte[] bytes = serialize(writable);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes)); // prints "4,163"
        deserialize(writable, bytes);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes)); // prints "4,163" again
        // Compare two IntWritables, first as objects, then as raw bytes.
        RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable.class);
        IntWritable w1 = new IntWritable(163);
        IntWritable w2 = new IntWritable(67);
        int result = comparator.compare(w1, w2);
        System.out.println(result); // 1, since 163 > 67
        byte[] b1 = serialize(w1);
        byte[] b2 = serialize(w2);
        result = comparator.compare(b1, 0, b1.length, b2, 0, b2.length);
        System.out.println(result); // 1 again, computed without deserializing
    }
}
Java primitive | Writable implementation | Serialized size (bytes) |
boolean | BooleanWritable | 1 |
byte | ByteWritable | 1 |
short | ShortWritable | 2 |
int | IntWritable | 4 |
int | VIntWritable | 1-5 |
float | FloatWritable | 4 |
long | LongWritable | 8 |
long | VLongWritable | 1-9 |
double | DoubleWritable | 8 |
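The variable-length types pay off when most values are small. A quick check of the serialized sizes, reusing the WritableHelper.serialize method above (VIntWritable stores values between -112 and 127 in a single byte; larger values need a length byte plus the value bytes):
package com.bigdata.io;
import java.io.IOException;
import org.apache.hadoop.io.VIntWritable;
public class VIntSizeDemo {
    public static void main(String[] args) throws IOException {
        System.out.println(WritableHelper.serialize(new VIntWritable(127)).length);               // 1
        System.out.println(WritableHelper.serialize(new VIntWritable(163)).length);               // 2
        System.out.println(WritableHelper.serialize(new VIntWritable(Integer.MAX_VALUE)).length); // 5
    }
}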
Avro
Apache Avro is a language-neutral data serialization system that addresses the two properties Writables lack: extensibility and interoperability. Avro defines a small set of primitive types:
Type | Description | Schema |
null | The absence of a value | "null" |
boolean | A binary value | "boolean" |
int | 32-bit signed integer | "int" |
long | 64-bit signed integer | "long" |
float | Single precision (32-bit) IEEE 754 floating-point number | "float" |
double | Double precision (64-bit) IEEE 754 floating-point number | "double" |
bytes | Sequence of 8-bit unsigned bytes | "bytes" |
string | Sequence of Unicode characters | "string" |
Type | Description | Schema example |
array | An ordered collection of objects. All objects in a particular array must have the same schema. | { "type":"array", "items":"long" } |
map | An unordered collection of key-value pairs. Keys must be strings, values may be any type, although within a particular map all values must have the same schema. | { "type":"map", "values":"string" } |
record | A collection of named fields of any type. | { "type":"record", "name":"WeatherRecord", "doc":"A weather reading.", "fields":[ {"name":"year","type":"int"}, {"name":"temperature","type":"int"}, {"name":"stationId","type":"string"} ] } |
enum | A set of named values. | { "type":"enum", "name":"Cutlery", "doc":"An eating utensil.", "symbols":["KNIFE","FORK","SPOON"] } |
fixed | A fixed number of 8-bit unsigned bytes. | { "type":"fixed", "name":"Md5Hash", "size":16 } |
union | A union of schemas. A union is represented by a JSON array, where each element in the array is a schema. Data represented by a union must match one of the schemas in the union. | [ "null", "string", {"type":"map","values":"string"} ] |
The following example serializes a record with Avro's generic API and reads it back:
package com.bigdata.io.avro;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
public class StringPair {
    public static void main(String[] args) throws IOException {
        // Load the schema from the classpath.
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(StringPair.class.getResourceAsStream("/StringPair.avsc"));
        // Create an instance of an Avro record using the generic API.
        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");
        // Serialize the record to an output stream.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(datum, encoder);
        encoder.flush();
        out.close();
        // Read the record back from the bytes.
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord result = reader.read(null, decoder);
        String r1 = result.get("left").toString();
        String r2 = result.get("right").toString();
        System.out.println(r1 + "," + r2); // prints "L,R"
    }
}
The schema file StringPair.avsc, placed on the classpath:
{
    "type": "record",
    "name": "StringPair",
    "doc": "A pair of strings.",
    "fields": [
        {"name": "left", "type": "string"},
        {"name": "right", "type": "string"}
    ]
}
package com.bigdata.io.avro;
import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
public class AvroWriteToFile {
    public static void main(String[] args) throws IOException {
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(AvroWriteToFile.class.getResourceAsStream("/StringPair.avsc"));
        // Write two records to an Avro data file; the schema is stored in the file's metadata.
        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");
        File file = new File("data.avro");
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        dataFileWriter.create(schema, file);
        dataFileWriter.append(datum);
        datum.put("left", "is left");
        datum.put("right", "is right");
        dataFileWriter.append(datum);
        dataFileWriter.close();
        // Read the records back, first with the for-each idiom...
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader);
        for (GenericRecord record : fileReader) {
            System.out.println(record.get("left") + "," + record.get("right"));
        }
        // ...then rewind to the first sync point and iterate again explicitly.
        fileReader.sync(0);
        System.out.println(fileReader.getBlockCount());
        GenericRecord result = null;
        while (fileReader.hasNext()) {
            result = fileReader.next();
            System.out.println(result.get("left") + "," + result.get("right"));
        }
        fileReader.close();
    }
}
Schema evolution: add a description field to StringPair.avsc. A default value must be provided, so that data written with the original schema can still be read with the new one, while the new schema can also be used going forward.
{
    "type": "record",
    "name": "StringPair",
    "doc": "A pair of strings.",
    "fields": [
        {"name": "left", "type": "string"},
        {"name": "right", "type": "string"},
        {"name": "description", "type": ["null", "string"], "default": null}
    ]
}
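A sketch of the two schemas working together, assuming the new version is saved as /StringPairV2.avsc (a hypothetical file name). Avro resolves the writer's schema against the reader's schema at read time and fills in description from its default:
package com.bigdata.io.avro;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
public class SchemaEvolutionDemo {
    public static void main(String[] args) throws IOException {
        // Two separate parsers: a single parser would reject a second record named StringPair.
        Schema writerSchema = new Schema.Parser()
                .parse(SchemaEvolutionDemo.class.getResourceAsStream("/StringPair.avsc"));
        Schema readerSchema = new Schema.Parser()
                .parse(SchemaEvolutionDemo.class.getResourceAsStream("/StringPairV2.avsc"));
        // Serialize a record using the original two-field schema.
        GenericRecord datum = new GenericData.Record(writerSchema);
        datum.put("left", "L");
        datum.put("right", "R");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(writerSchema);
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(datum, encoder);
        encoder.flush();
        // Deserialize with both schemas; the missing field is filled in from its default.
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(writerSchema, readerSchema);
        Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord result = reader.read(null, decoder);
        System.out.println(result.get("left") + "," + result.get("right") + "," + result.get("description")); // prints "L,R,null"
    }
}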