在运行map和reduce任务时会需要从HDFS中读取数据、从Linux磁盘中读取数据,这些数据往往存在于不同的节点上,这样就会产生网络IO消耗。Hadoop提供了org.apache.hadoop.io.Writable接口来实现序列化,相比于Java的序列化,Hadoop的序列化消耗资源更少。
writable接口中存在两个方法
void write(DataOutput out) throws IOException;
void readFields(DataInput in) throws IOException;
write方法的作用是将数据序列化到DataOutput中,readFields方法的作用是从输入流中反序列化数据,需要我们手动来完成。Java则不用,只需要实现java.io.Serializable接口即可。下面就是一个Hadoop的序列化例子。
package mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import org.apache.hadoop.io.Writable;
import org.junit.Test;
public class TestSerExample {
    /**
     * Java serialization example: writes a {@code peopleJava} instance to disk
     * through an ObjectOutputStream. The class only needs to implement
     * java.io.Serializable; the JDK handles the field encoding.
     */
    @Test
    public void testJava() throws Exception {
        peopleJava p1 = new peopleJava(1L, "张三");
        // try-with-resources closes both streams (in reverse order) even if
        // writeObject throws — the original leaked them on failure.
        try (FileOutputStream fs = new FileOutputStream(new File("G:/BigData/java1"));
                ObjectOutputStream os = new ObjectOutputStream(fs)) {
            os.writeObject(p1);
        }
    }

    /**
     * Hadoop serialization example: the object serializes itself into a
     * DataOutput via Writable.write().
     */
    @Test
    public void testHadoop() throws Exception {
        peopleHadoop p1 = new peopleHadoop(1L, "张三");
        // The original closed fs BEFORE dos, which can lose bytes still
        // buffered in the DataOutputStream. try-with-resources closes the
        // wrapper first, flushing it into the file stream correctly.
        try (FileOutputStream fs = new FileOutputStream(new File("G:/BigData/hadoop1"));
                DataOutputStream dos = new DataOutputStream(fs)) {
            p1.write(dos);
        }
    }
}
/**
 * Java serialization example: implementing java.io.Serializable is
 * sufficient — the JDK serializes the fields automatically, no custom
 * write/read code is required.
 *
 * @author think
 */
class peopleJava implements java.io.Serializable
{
    // Explicit serialVersionUID: without it the JVM derives one from the
    // class structure, so any later change to the class silently breaks
    // deserialization of previously written data.
    private static final long serialVersionUID = 1L;

    private Long id;
    private String name;

    public peopleJava(Long id, String name) {
        super();
        this.id = id;
        this.name = name;
    }
}
/**
 * Hadoop serialization example: implements org.apache.hadoop.io.Writable,
 * so the class itself defines how its fields are written to and read back
 * from the binary stream.
 *
 * @author think
 */
class peopleHadoop implements Writable
{
    private Long id;
    private String name;

    /**
     * No-arg constructor: the Hadoop framework instantiates Writables
     * reflectively and then calls readFields(), so this must exist.
     */
    public peopleHadoop() {
    }

    public peopleHadoop(Long id, String name) {
        super();
        this.id = id;
        this.name = name;
    }

    /**
     * Serializes this object's fields into the DataOutput stream.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);
        out.writeUTF(name);
    }

    /**
     * Deserializes the fields from the DataInput stream. Fields must be
     * read in exactly the same order and with the same types as write()
     * wrote them. The original left this method empty, so deserialization
     * silently produced an object with null fields.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readLong();
        this.name = in.readUTF();
    }
}