MapFile
是排序后的SequenceFile,MapFile由两部分组成,分别是data和index。
index
文件的数据索引,主要记录了每个 Record 的 key 值,以及该 Record 在文件中的偏移位置。MapFile 被访问时,索引文件会先被加载到内存,通过索引映射关系可迅速定位到指定 Record 所在的文件位置。因此,相对 SequenceFile 而言,MapFile 的检索效率更高;缺点是需要消耗一部分内存来存储 index 数据——读取时整个 index 都会被加载到内存,所以设计 key 时应使其尽可能小。
读、写源码:
package org.apache.hadoop.io;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;
/**
 * Demonstrates writing and reading a MapFile.
 *
 * <p>Write phase: appends ten {@code IntWritable -> Text} records to the
 * MapFile at {@code uri}. Read phase: re-opens the file and prints every
 * key/value pair to stdout.
 *
 * <p>Fixes over the original: the reader-side holders created via
 * {@code ReflectionUtils.newInstance} were dead code (the loop reused the
 * writer-side {@code key}/{@code value}, which already have the correct
 * types), and the {@code MapFile.Reader} was never closed — both the writer
 * and the reader are now released in {@code finally} blocks.
 */
public class THT_testMapFileWrite1 {
    private static final String[] DATA = { "One, two, buckle my shoe",
            "Three, four, shut the door", "Five, six, pick up sticks",
            "Seven, eight, lay them straight", "Nine, ten, a big fat hen" };

    public static void main(String[] args) throws IOException {
        // String uri = args[0];
        String uri = "file:///D://tmp//map1";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        IntWritable key = new IntWritable();
        Text value = new Text();

        // Write phase. MapFile.Writer requires keys to be appended in
        // strictly ascending order; i + 1 satisfies that.
        MapFile.Writer writer = null;
        try {
            writer = new MapFile.Writer(conf, fs, uri, key.getClass(),
                    value.getClass());
            for (int i = 0; i < 10; i++) {
                key.set(i + 1);
                value.set(DATA[i % DATA.length]);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }

        // Read phase. key/value are reused as holders: MapFile.Reader.next
        // fills them in place for each record. Close the reader even if the
        // loop throws.
        MapFile.Reader reader = null;
        try {
            reader = new MapFile.Reader(fs, uri, conf);
            while (reader.next(key, value)) {
                System.out.printf("%s\t%s\n", key, value);
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}
运行结果:
2015-11-08 11:46:09,532 INFO compress.CodecPool (CodecPool.java:getDecompressor(181)) - Got brand-new decompressor [.deflate]
1 One, two, buckle my shoe
2 Three, four, shut the door
3 Five, six, pick up sticks
4 Seven, eight, lay them straight
5 Nine, ten, a big fat hen
6 One, two, buckle my shoe
7 Three, four, shut the door
8 Five, six, pick up sticks
9 Seven, eight, lay them straight
10 Nine, ten, a big fat hen
生成一个文件夹,文件夹中有两个文件index、data:
index内容:
data内容:
重建索引(index)
这里首先将刚才生成的index文件删除掉,上源码:
package org.apache.hadoop.io;
//cc MapFileFixer Re-creates the index for a MapFile
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.util.ReflectionUtils;
//vv MapFileFixer
/**
 * Re-creates the index of a MapFile whose index file has been deleted, then
 * dumps the data file's records together with their byte offsets.
 *
 * <p>Fixes over the original: the key/value classes are carried in
 * {@code Class<? extends Writable>} instead of raw {@code Class}; the header
 * reader is closed in a {@code finally} block; and the dump phase reuses the
 * already-built {@code mapData} path instead of re-concatenating
 * {@code mapUri + "//data"} with a doubled slash.
 */
public class THT_testMapFileFix {
    public static void main(String[] args) throws Exception {
        // String mapUri = args[0];
        String mapUri = "file:///D://tmp//map1";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(mapUri), conf);
        Path map = new Path(mapUri);
        Path mapData = new Path(map, MapFile.DATA_FILE_NAME);

        // Read the key/value types from the data file's header; MapFile.fix
        // needs them to rebuild the index. Close the reader even on failure.
        Class<? extends Writable> keyClass;
        Class<? extends Writable> valueClass;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, mapData, conf);
        try {
            @SuppressWarnings("unchecked")
            Class<? extends Writable> kc =
                    (Class<? extends Writable>) reader.getKeyClass();
            @SuppressWarnings("unchecked")
            Class<? extends Writable> vc =
                    (Class<? extends Writable>) reader.getValueClass();
            keyClass = kc;
            valueClass = vc;
        } finally {
            reader.close();
        }

        // Rebuild the index file (dryrun = false actually writes it).
        long entries = MapFile.fix(fs, map, keyClass, valueClass, false, conf);
        System.out.printf("Created MapFile %s with %d entries\n", map, entries);

        // Dump every record with its starting byte offset; a '*' marks a
        // record that was preceded by a sync point.
        SequenceFile.Reader.Option option1 = Reader.file(mapData);
        SequenceFile.Reader reader1 = null;
        try {
            reader1 = new SequenceFile.Reader(conf, option1);
            Writable key = (Writable) ReflectionUtils.newInstance(
                    reader1.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(
                    reader1.getValueClass(), conf);
            long position = reader1.getPosition();
            while (reader1.next(key, value)) {
                String syncSeen = reader1.syncSeen() ? "*" : "";
                System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key,
                        value);
                position = reader1.getPosition(); // beginning of next record
            }
        } finally {
            IOUtils.closeStream(reader1);
        }
    }
}
// ^^ MapFileFixer
运行结果如下:
2015-11-08 12:16:34,015 INFO compress.CodecPool (CodecPool.java:getCompressor(153)) - Got brand-new compressor [.deflate]
Created MapFile file:/D:/tmp/map1 with 10 entries
[128] 1 One, two, buckle my shoe
[173] 2 Three, four, shut the door
[220] 3 Five, six, pick up sticks
[264] 4 Seven, eight, lay them straight
[314] 5 Nine, ten, a big fat hen
[359] 6 One, two, buckle my shoe
[404] 7 Three, four, shut the door
[451] 8 Five, six, pick up sticks
[495] 9 Seven, eight, lay them straight
[545] 10 Nine, ten, a big fat hen