// The reason for using SequenceFile here is its sync points: I hoped no single record
// would ever straddle a data block, which would make my processing much easier.
// It seems, though, that this case can still occur... so the only option left is to
// modify the Writer. (A sketch of how readers use these sync points appears after the class below.)
package examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.*;
import org.apache.hama.bsp.LineRecordReader;
import org.apache.hama.bsp.RecordReader;
public class FileFormatConvert {

  private static final Configuration conf = new Configuration();

  public static void write2SequenceFile(Path inputFile, Path outputFile) {
    // Read every record of the plain-text input file and write it out
    // again in SequenceFile format.
    RecordReader<LongWritable, Text> reader = null;
    Writer writer = null;
    conf.set("fs.default.name", "hdfs://localhost:9000/");
    try {
      FileSystem hdfs = FileSystem.get(conf);
      writer = SequenceFile.createWriter(hdfs, conf, outputFile,
          Text.class, Text.class, CompressionType.NONE);
      FSDataInputStream dis = hdfs.open(inputFile);
      reader = new LineRecordReader(dis, 0,
          hdfs.getFileStatus(inputFile).getLen(), conf);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      while (reader.next(key, value)) {
        // Each line is expected to hold "key value" separated by one space;
        // the byte offset produced by LineRecordReader is discarded.
        String[] keyValue = value.toString().split(" ");
        writer.append(new Text(keyValue[0]), new Text(keyValue[1]));
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(writer);
      if (reader != null) {
        try {
          reader.close();  // also closes the underlying input stream
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  public static void main(String[] args) {
    Path inputFile = new Path("hdfs://localhost:9000/test/test.txt");
    Path outputFile = new Path("hdfs://localhost:9000/liuqiang2/test.seq");
    write2SequenceFile(inputFile, outputFile);
  }
}
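
To sanity-check the conversion, the output can be read back with SequenceFile.Reader. The helper below is a hypothetical addition (not part of the original class); it assumes the same fs.default.name setting used above.

// Hypothetical helper (not in the original post): print every key/value
// pair of the SequenceFile produced by write2SequenceFile.
public static void dumpSequenceFile(Path seqFile) throws IOException {
  Configuration conf = new Configuration();
  conf.set("fs.default.name", "hdfs://localhost:9000/");
  FileSystem hdfs = FileSystem.get(conf);
  SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, seqFile, conf);
  try {
    Text key = new Text();
    Text value = new Text();
    while (reader.next(key, value)) {
      System.out.println(key + "\t" + value);
    }
  } finally {
    IOUtils.closeStream(reader);
  }
}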
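
On the sync-point remark at the top of the post: SequenceFile embeds sync markers in the stream, and SequenceFile.Reader.sync(position) seeks to the first marker past a given byte offset, which is how a split reader re-aligns at a block boundary. The method below is a rough illustrative sketch with invented offsets (splitStart, splitEnd), not the original author's fix; Hadoop's own SequenceFileRecordReader refines the boundary case with syncSeen().

// Illustrative sketch (not in the original post): read only the records of
// one "split" of a SequenceFile by re-aligning on a sync marker.
// splitStart and splitEnd are hypothetical byte offsets.
public static void readSplit(Path seqFile, long splitStart, long splitEnd)
    throws IOException {
  Configuration conf = new Configuration();
  conf.set("fs.default.name", "hdfs://localhost:9000/");
  FileSystem hdfs = FileSystem.get(conf);
  SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, seqFile, conf);
  try {
    reader.sync(splitStart);  // jump to the first sync marker past splitStart
    Text key = new Text();
    Text value = new Text();
    // Rough stopping rule: stop once the reader has moved past splitEnd.
    // Hadoop's real record reader refines this with syncSeen() so that
    // boundary records are read exactly once.
    while (reader.getPosition() < splitEnd && reader.next(key, value)) {
      System.out.println(key + "\t" + value);
    }
  } finally {
    IOUtils.closeStream(reader);
  }
}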