Hadoop's HDFS and MapReduce subframeworks were designed primarily with large files in mind. They handle small files inefficiently, and large numbers of small files also strain the NameNode: metadata for every file and block is held in NameNode memory, no matter how little data the file contains (the default HDFS block size is 64 MB, but a small file does not waste a full block on disk; the real cost is the per-file metadata). The usual remedy is to pick a container format, pack the small files into it, and store them as one unit. HDFS provides two such container types, SequenceFile and MapFile: a SequenceFile stores flat binary key-value records, while a MapFile is a sorted SequenceFile with an index that supports lookup by key.
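As a rough rule of thumb (an estimate, not from the original text), each file, directory, and block costs on the order of 150 bytes of NameNode heap. Ten million one-block small files therefore mean about 20 million metadata objects, or roughly 3 GB of NameNode memory, before a single byte of payload is read.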
1 Using SequenceFile
package org.tony.file;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
public class SequenceFileWriter {
    public static String uri = "hdfs://192.168.142.128:9000"; // URI of the HDFS instance
    public static String[] data = { "one,two", "three,four", "five,six",
            "seven,eight", "nine,ten" };

    public static void main(String[] args) throws IOException {
        write();
        read();
    }
    /**
     * @Title: write
     * @Description: (1) create a Configuration; (2) get the FileSystem;
     *               (3) create the output Path; (4) call SequenceFile.createWriter
     *               to obtain a SequenceFile.Writer; (5) call SequenceFile.Writer.append
     *               to append records; (6) close the stream.
     */
    public static void write() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path("/tmp.seq");
        // Path path = new Path("/tmp1.seq"); // compressed variant
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
                key.getClass(), value.getClass());
        // Compressed variant using record compression with the BZip2 codec; requires
        // imports org.apache.hadoop.io.SequenceFile.CompressionType and
        // org.apache.hadoop.io.compress.BZip2Codec:
        // SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
        //         key.getClass(), value.getClass(), CompressionType.RECORD, new BZip2Codec());
        for (int i = 0; i < 100; i++) { // write 100 records
            key.set(100 - i); // keys descend from 100 to 1; a SequenceFile does not require sorted keys
            value.set(data[i % data.length]);
            writer.append(key, value);
        }
        IOUtils.closeStream(writer);
    }
    /**
     * @Title: read
     * @Description: (1) create a Configuration; (2) get the FileSystem;
     *               (3) create the input Path; (4) construct a SequenceFile.Reader;
     *               (5) instantiate the key and value classes via ReflectionUtils;
     *               (6) close the stream.
     */
    public static void read() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path("/tmp.seq");
        // Path path = new Path("/tmp1.seq"); // read the compressed file
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Writable key = (Writable) ReflectionUtils.newInstance(
                reader.getKeyClass(), conf);
        Writable value = (Writable) ReflectionUtils.newInstance(
                reader.getValueClass(), conf);
        while (reader.next(key, value)) {
            System.out.println("key = " + key);
            System.out.println("value = " + value);
            System.out.println("position = " + reader.getPosition());
        }
        IOUtils.closeStream(reader);
    }
}
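For completeness, here is the compressed variant from the comments above spelled out as a runnable program. This is a minimal sketch under the same assumptions as the code above (same HDFS URI, the /tmp1.seq path from the comment); the class name CompressedSequenceFileWriter is illustrative. CompressionType.RECORD compresses each value on its own, while CompressionType.BLOCK compresses batches of records and usually achieves better ratios.

package org.tony.file;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;

public class CompressedSequenceFileWriter {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.142.128:9000"), conf);
        Path path = new Path("/tmp1.seq");
        IntWritable key = new IntWritable();
        Text value = new Text();
        // Same createWriter call as before, with a compression type and codec appended.
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
                key.getClass(), value.getClass(), CompressionType.RECORD,
                new BZip2Codec());
        for (int i = 0; i < 100; i++) {
            key.set(100 - i);
            value.set("record-" + i); // illustrative payload
            writer.append(key, value);
        }
        IOUtils.closeStream(writer);
    }
}

Either file can then be inspected from the shell with hadoop fs -text /tmp1.seq, which recognizes SequenceFiles and their codecs and prints the records as text.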
2 Using MapFile
package org.tony.file;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.ReflectionUtils;
public class MapFileTest {
    public static String uri = "hdfs://192.168.142.128:9000"; // URI of the HDFS instance
    public static String[] data = { "one,two", "three,four", "five,six",
            "seven,eight", "nine,ten" };
    public static void main(String[] args) throws Exception {
        write();
        read();
        // seqToMapFile(); // requires the manual setup steps described below
    }
    /** Write a MapFile.
     * @Title: write
     * @Description: (1) create a Configuration; (2) get the FileSystem;
     *               (3) create the output Path; (4) construct a MapFile.Writer;
     *               (5) call MapFile.Writer.append to append records; (6) close the stream.
     */
    public static void write() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path("/tmpdata.map");
        IntWritable key = new IntWritable();
        Text value = new Text();
        MapFile.Writer writer = new MapFile.Writer(conf, fs, path.toString(),
                key.getClass(), value.getClass());
        for (int i = 0; i < 100; i++) {
            key.set(i + 1); // MapFile keys must be appended in ascending order
            value.set(data[i % data.length]);
            writer.append(key, value);
        }
        IOUtils.closeStream(writer);
    }
    /** Read a MapFile sequentially.
     * @Title: read
     * @Description: (1) create a Configuration; (2) get the FileSystem;
     *               (3) create the input Path; (4) construct a MapFile.Reader;
     *               (5) instantiate the key and value classes via ReflectionUtils;
     *               (6) close the stream.
     */
    public static void read() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path("/tmpdata.map");
        MapFile.Reader reader = new MapFile.Reader(fs, path.toString(), conf);
        WritableComparable key = (WritableComparable) ReflectionUtils
                .newInstance(reader.getKeyClass(), conf);
        Writable value = (Writable) ReflectionUtils.newInstance(
                reader.getValueClass(), conf);
        while (reader.next(key, value)) {
            System.out.println("key = " + key);
            System.out.println("value = " + value);
        }
        IOUtils.closeStream(reader);
    }
    /** Convert a SequenceFile into a MapFile.
     * @Title: seqToMapFile
     * @Description: manual setup before running:
     *               1. create the /tmp1.map directory;
     *               2. move the SequenceFile tmp1.seq into /tmp1.map and rename it "data":
     *                  $ ./hadoop fs -mv /tmp1.seq /tmp1.map/data
     *               3. run this program.
     *               Note that the data file must be sorted by key, since MapFile
     *               lookups depend on ordered keys.
     */
    public static void seqToMapFile() throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path map = new Path("/tmp1.map"); // the MapFile directory
        // MapFile.DATA_FILE_NAME ("data") is the name the moved SequenceFile
        // must carry inside the MapFile directory
        Path mapData = new Path(map, MapFile.DATA_FILE_NAME);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, mapData, conf);
        Class key = reader.getKeyClass();
        Class value = reader.getValueClass();
        reader.close();
        // dryrun = false: actually (re)build the index file rather than just checking
        long entries = MapFile.fix(fs, map, key, value, false, conf);
        System.out.printf("Created MapFile %s with %d entries\n", map, entries);
    }
}
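What a MapFile buys over a plain SequenceFile is keyed random access: the index file written next to data lets MapFile.Reader.get seek close to a key and read just that record instead of scanning the whole file. Below is a minimal sketch of such a lookup against the /tmpdata.map file written above; it would fit as another method of MapFileTest. The method name getByKey and the probe key 42 are illustrative, not from the original.

    /** Hypothetical example: random lookup by key in the MapFile written above. */
    public static void getByKey() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        MapFile.Reader reader = new MapFile.Reader(fs, "/tmpdata.map", conf);
        IntWritable key = new IntWritable(42); // probe for the record with key 42
        Text value = new Text();
        // get() consults the in-memory index, seeks into the data file near the key,
        // and scans forward to it; it returns null when the key is absent.
        Writable result = reader.get(key, value);
        System.out.println(result == null ? "key not found" : "value = " + value);
        IOUtils.closeStream(reader);
    }

By default the index records every 128th key (the io.map.index.interval property), which keeps it small enough to hold in memory; a larger interval trades lookup speed for less memory.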