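// Packs every file in the target directory into a SequenceFile, using the file name as the key and the file content as the value.
// The first argument is the directory to pack; the second is the path and name of the generated file.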
private static void combineToSequenceFile(String[] args) throwsIOException {
String sourceDir= args[0];
String destFile= args[1];
List files =getFiles(sourceDir);
Configuration conf= newConfiguration();
FileSystem fs=FileSystem.get(conf);
Path destPath= newPath(destFile);if(fs.exists(destPath)) {
fs.delete(destPath,true);
}
FSDataInputStream in= null;
Text key= newText();
BytesWritable value= newBytesWritable();byte[] buff = new byte[4096];
SequenceFile.Writer writer= null;
SequenceFile.Writer.Option option1= SequenceFile.Writer.file(newPath(destFile));
SequenceFile.Writer.Option option2=SequenceFile.Writer.keyClass(key.getClass());
SequenceFile.Writer.Option option3=SequenceFile.Writer.valueClass(value.getClass());
SequenceFile.Writer.Option option4=SequenceFile.Writer.compression(SequenceFile.CompressionType.RECORD);try{
writer=SequenceFile.createWriter(conf, option1, option2, option3, option4);for (int i = 0; i < files.size(); i++) {
Path path= newPath(files.get(i).toString());
System.out.println("读取文件:" +path.toString());
key= newText(files.get(i).toString());
in=fs.open(path);//只能处理小文件,int最大只能表示到1个G的大小,实际上大文件放入SequenceFile也没有意义
int length = (int) fs.getFileStatus(path).getLen();byte[] bytes = new byte[length];//read最多只能读取65536的大小
int readLength =in.read(buff);int offset = 0;while (readLength > 0) {
System.arraycopy(buff,0, bytes, offset, readLength);
offset+=readLength;
readLength=in.read(buff);
}
System.out.println("file length:" + length + ",read length:" +offset);
value= newBytesWritable(bytes);
System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value.getLength());
writer.append(key, value);
}
}finally{
IOUtils.closeStream(in);
IOUtils.closeStream(writer);
IOUtils.closeStream(fs);
}
}