1
2
3
4
5
6
7
8
|
/** Compression options that can be chosen for a SequenceFile. */
public
static
enum
CompressionType {
/** Do not compress records. */
NONE,
/** Compress only the value of each record. */
RECORD,
/** Compress the keys/values of many records together as one block. */
BLOCK
}
|
1
|
public
static
class
Writer
implements
java.io.Closeable, Syncable
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
/**
 * Emits the file header to {@code out} and flushes it.
 *
 * <p>Fields are written in this order: version bytes, key class name, value class name,
 * the compressed flag, the block-compressed flag, the codec class name (only when
 * {@code isCompressed()} is true), the metadata, and finally the sync bytes.
 *
 * @throws IOException if a write to {@code out} or the final flush fails
 */
private void writeFileHeader() throws IOException {
  out.write(VERSION);
  Text.writeString(out, keyClass.getName());
  Text.writeString(out, valClass.getName());

  out.writeBoolean(isCompressed());
  out.writeBoolean(isBlockCompressed());

  if (isCompressed()) {
    // The codec class name is only part of the header when compression is on.
    String codecName = codec.getClass().getName();
    Text.writeString(out, codecName);
  }
  metadata.write(out);

  // The sync bytes are the last header field; flush so the whole header is pushed out.
  out.write(sync);
  out.flush();
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
package
demo;
import
java.io.IOException;
import
java.net.URI;
import
org.apache.hadoop.conf.Configuration;
import
org.apache.hadoop.fs.FileSystem;
import
org.apache.hadoop.fs.Path;
import
org.apache.hadoop.io.IOUtils;
import
org.apache.hadoop.io.IntWritable;
import
org.apache.hadoop.io.SequenceFile;
import
org.apache.hadoop.io.SequenceFile.CompressionType;
import
org.apache.hadoop.io.Text;
/**
 * Command-line demo that writes 100 {@code IntWritable}/{@code Text} records to a SequenceFile.
 *
 * <p>Usage: {@code SequenceFileWriteDemo <uri> <compressType>} where {@code compressType} is
 * {@code 1} (no compression), {@code 2} (record compression) or {@code 3} (block compression).
 */
public class SequenceFileWriteDemo {

  /** Lines cycled through as record values. */
  private static final String[] DATA = {
    "One, two, buckle my shoe",
    "Three, four, shut the door",
    "Five, six, pick up sticks",
    "Seven, eight, lay them straight",
    "Nine, ten, a big fat hen"
  };

  /** Number of records appended by the demo. */
  private static final int RECORD_COUNT = 100;

  private static final String USAGE =
      "Usage: SequenceFileWriteDemo <uri> <compressType: 1=none, 2=record, 3=block>";

  /**
   * Writes {@link #RECORD_COUNT} records to the file named by {@code args[0]} using the
   * compression mode selected by {@code args[1]}.
   *
   * @param args {@code args[0]} is the target URI, {@code args[1]} the compression code
   * @throws IOException if the file system or the writer reports an I/O failure
   * @throws IllegalArgumentException if an argument is missing or the compression code is unknown
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      throw new IllegalArgumentException(USAGE);
    }
    String uri = args[0];
    String compressType = args[1];
    // Resolve the mode first so a bad argument fails before any file system access,
    // instead of surfacing later as a NullPointerException on a writer that was never created.
    CompressionType compression = parseCompressionType(compressType);

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://xxx.xxx.xxx.xx:9000");
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      System.out.println("compressType " + compressType);
      // Prints "compress none", "compress record" or "compress block".
      System.out.println("compress " + compression.name().toLowerCase(java.util.Locale.ROOT));
      writer = SequenceFile.createWriter(
          fs, conf, path, key.getClass(), value.getClass(), compression);

      for (int i = 0; i < RECORD_COUNT; i++) {
        key.set(RECORD_COUNT - i);
        value.set(DATA[i % DATA.length]);
        // getLength() is read before the append, so the printed number is the length
        // the writer reports prior to adding this record.
        System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
        writer.append(key, value);
      }
    } finally {
      // Also reached when createWriter fails and writer is still null.
      IOUtils.closeStream(writer);
    }
  }

  /**
   * Maps the command-line code to a compression mode.
   *
   * <ul>
   *   <li>{@code "1"} - {@code NONE}: uncompressed records.</li>
   *   <li>{@code "2"} - {@code RECORD}: record-compressed files, only values are compressed.</li>
   *   <li>{@code "3"} - {@code BLOCK}: block-compressed files, keys and values are collected
   *       in blocks separately and compressed.</li>
   * </ul>
   *
   * @param code the raw command-line argument
   * @return the matching compression mode
   * @throws IllegalArgumentException if {@code code} is not one of the supported codes
   */
  private static CompressionType parseCompressionType(String code) {
    if ("1".equals(code)) {
      return CompressionType.NONE;
    }
    if ("2".equals(code)) {
      return CompressionType.RECORD;
    }
    if ("3".equals(code)) {
      return CompressionType.BLOCK;
    }
    throw new IllegalArgumentException(
        "Unknown compressType '" + code + "'; expected 1, 2 or 3. " + USAGE);
  }
}
|
1
2
3
4
|
/** Version bytes written to the header: 'S', 'E', 'Q', then VERSION_WITH_METADATA. */
private static byte[] VERSION = {
  (byte) 'S', (byte) 'E', (byte) 'Q', VERSION_WITH_METADATA
};
out.write(VERSION);
|
1
|
Text.writeString(out, keyClass.getName());
|
1
|
Text.writeString(out, valClass.getName());
|
1
2
|
out.writeBoolean(
this
.isCompressed());
out.writeBoolean(
this
.isBlockCompressed());
|
1
2
3
|
if
(
this
.isCompressed()) {
Text.writeString(out, (codec.getClass()).getName());
}
|
1
|
this
.metadata.write(out);
|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
/** Sync marker bytes: the MD5 digest (16 bytes) of a fresh UID joined with the current time. */
byte[] sync;

{
  try {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    long now = Time.now();
    String seed = new UID() + "@" + now;
    md5.update(seed.getBytes());
    sync = md5.digest();
  } catch (Exception e) {
    // Any failure here is rethrown unchecked with the original exception as cause.
    throw new RuntimeException(e);
  }
}
out.write(sync);
|
SequenceFile Header
-
version - 3 bytes of magic header SEQ, followed by 1 byte of actual version number (e.g. SEQ4 or SEQ6)
-
keyClassName - key class
-
valueClassName - value class
-
compression - A boolean which specifies if compression is turned on for keys/values in this file.
-
blockCompression - A boolean which specifies if block-compression is turned on for keys/values in this file.
-
compression codec - CompressionCodec class which is used for compression of keys and/or values (if compression is enabled).
-
metadata - Metadata for this file.
-
sync - A sync marker to denote end of the header.
1
2
3
4
5
6
7
8
|
// Excerpt of append(): the earlier statements are elided ("......."); only the final
// write-out of the record is shown.
public
synchronized
void
append(Object key, Object val){
.......
// Write the record out.
// First let checkAndWriteSync() emit a sync marker if one is due.
checkAndWriteSync();
// Total record length.
out.writeInt(buffer.getLength());
// Length of the key portion.
out.writeInt(keyLength);
// Record data.
out.write(buffer.getData(),
0
, buffer.getLength());
}
|
1
2
3
4
5
6
|
/**
 * Calls {@code sync()} once the output position is at least {@code SYNC_INTERVAL} bytes past
 * {@code lastSyncPos}. Does nothing when {@code sync} is null.
 *
 * @throws IOException if reading the output position or writing the sync fails
 */
synchronized void checkAndWriteSync() throws IOException {
  if (sync == null) {
    return;
  }
  if (out.getPos() < lastSyncPos + SYNC_INTERVAL) {
    // Not far enough past the previous sync point yet.
    return;
  }
  sync();
}
|
1
2
3
4
5
|
/** Number of bytes in the sync hash. */
private static final int SYNC_HASH_SIZE = 16;

/** Size of a sync entry: 4 bytes of escape followed by the hash. */
private static final int SYNC_SIZE = SYNC_HASH_SIZE + 4;

/** The number of bytes between sync points. */
public static final int SYNC_INTERVAL = SYNC_SIZE * 100;
|
Uncompressed SequenceFile Format
-
Record
-
Record length
-
Key length
-
Key
-
Value
-
A sync-marker about every SYNC_INTERVAL bytes (written by checkAndWriteSync before a record).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
/**
 * Appends one record: the key is serialized as-is and the value is serialized through the
 * compressing serializer, then the record is written to {@code out}.
 *
 * @param key record key; its class must be exactly {@code keyClass}
 * @param val record value; its class must be exactly {@code valClass}
 * @throws IOException if either class check fails, the key length is negative, or writing fails
 */
@Override
@SuppressWarnings("unchecked")
public synchronized void append(Object key, Object val) throws IOException {
  if (key.getClass() != keyClass) {
    throw new IOException(
        "wrong key class: " + key.getClass().getName() + " is not " + keyClass);
  }
  if (val.getClass() != valClass) {
    throw new IOException(
        "wrong value class: " + val.getClass().getName() + " is not " + valClass);
  }

  buffer.reset();

  // The key is serialized first; the buffer length right after it is the key length.
  keySerializer.serialize(key);
  int keyLength = buffer.getLength();
  if (keyLength < 0) {
    throw new IOException("negative length keys not allowed: " + key);
  }

  // The value follows the key, compressed.
  deflateFilter.resetState();
  compressedValSerializer.serialize(val);
  deflateOut.flush();
  deflateFilter.finish();

  // Record layout on the stream: [sync if due] total length, key length, data.
  checkAndWriteSync();
  out.writeInt(buffer.getLength());
  out.writeInt(keyLength);
  out.write(buffer.getData(), 0, buffer.getLength());
}
|
Record-Compressed SequenceFile Format
-
Record
-
Record length
-
Key length
-
Key
-
Compressed Value
-
A sync-marker about every SYNC_INTERVAL bytes (written by checkAndWriteSync before a record).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
/**
 * Buffers one key/value pair for the current block and calls {@code sync()} once the buffered
 * keys and values together reach {@code compressionBlockSize} bytes.
 *
 * @param key record key; its class must be exactly {@code keyClass}
 * @param val record value; its class must be exactly {@code valClass}
 * @throws IOException if either class check fails, the key length is negative, or
 *     serializing/writing fails
 */
public synchronized void append(Object key, Object val) throws IOException {
  // Report the offending class name rather than the object's string form, so the message
  // describes what the check actually compared.
  if (key.getClass() != keyClass) {
    throw new IOException(
        "wrong key class: " + key.getClass().getName() + " is not " + keyClass);
  }
  if (val.getClass() != valClass) {
    throw new IOException(
        "wrong value class: " + val.getClass().getName() + " is not " + valClass);
  }

  // Save the key: the growth of keyBuffer across serialize() is this key's length.
  int oldKeyLength = keyBuffer.getLength();
  keySerializer.serialize(key);
  int keyLength = keyBuffer.getLength() - oldKeyLength;
  if (keyLength < 0) {
    throw new IOException("negative length keys not allowed: " + key);
  }
  WritableUtils.writeVInt(keyLenBuffer, keyLength);

  // Save the value the same way, recording its length separately.
  int oldValLength = valBuffer.getLength();
  uncompressedValSerializer.serialize(val);
  int valLength = valBuffer.getLength() - oldValLength;
  WritableUtils.writeVInt(valLenBuffer, valLength);

  // Added another key/value pair.
  ++noBufferedRecords;

  // Write the block out once the buffered keys and values are large enough.
  int currentBlockSize = keyBuffer.getLength() + valBuffer.getLength();
  if (currentBlockSize >= compressionBlockSize) {
    sync();
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
/**
 * Writes out the buffered block, if any records are buffered: calls super.sync(), writes the
 * record count, then the key-length, key, value-length and value buffers, flushes the stream
 * and finally resets the four buffers and the record counter.
 */
// NOTE(review): this excerpt ends after the closing brace of the if; the method's own
// closing brace is not part of the snippet.
public
synchronized
void
sync()
throws
IOException {
if
(noBufferedRecords >
0
) {
super
.sync();
// Number of records in this block
WritableUtils.writeVInt(out, noBufferedRecords);
// Write the key lengths, then the keys
writeBuffer(keyLenBuffer);
writeBuffer(keyBuffer);
// Write the value lengths, then the values
writeBuffer(valLenBuffer);
writeBuffer(valBuffer);
// Flush the file-stream
out.flush();
// Reset internal state so the next block starts empty
keyLenBuffer.reset();
keyBuffer.reset();
valLenBuffer.reset();
valBuffer.reset();
noBufferedRecords =
0
;
}
|
Block-Compressed SequenceFile Format
-
Record Block
-
Uncompressed number of records in the block
-
Compressed key-lengths block-size
-
Compressed key-lengths block
-
Compressed keys block-size
-
Compressed keys block
-
Compressed value-lengths block-size
-
Compressed value-lengths block
-
Compressed values block-size
-
Compressed values block
-
-
A sync-marker every block.