HDFS stores large files and, given the cost of seeks, favors streaming access, but that does not mean it cannot do random reads: FSDataInputStream supports reading from an arbitrary offset, while writes through FSDataOutputStream remain sequential (the file is written from start to end).
Reads are done with the FSDataInputStream class and writes with the FSDataOutputStream class. Below are two examples.
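FSDataInputStream offers two ways to read at an arbitrary offset: seek() followed by a normal read, and positioned reads (read/readFully with an explicit position), which the examples below use. A minimal sketch of the seek() variant, assuming a file already exists at the hypothetical path hdfs://localhost:9000/tmp/demo.txt:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical HDFS file; adjust the URI to your cluster
        String uri = "hdfs://localhost:9000/tmp/demo.txt";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        try (FSDataInputStream in = fs.open(new Path(uri))) {
            // jump to byte offset 100, then read up to 20 bytes from there
            in.seek(100);
            byte[] buffer = new byte[20];
            int n = in.read(buffer);
            System.out.println("read " + n + " bytes starting at offset 100");
        }
    }
}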
Example 1: read from the local file system, write to HDFS
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HdfsWr {
    /**
     * @param uri  local path (source file)
     * @param conf Hadoop configuration
     * @param uri2 HDFS path (destination file)
     * @throws IOException
     */
    public void fun(String uri, Configuration conf, String uri2) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FileSystem fs2 = FileSystem.get(URI.create(uri2), conf);
        // input stream for the source file
        FSDataInputStream in = null;
        // output stream for the HDFS file
        FSDataOutputStream out = null;
        byte[] buffer = new byte[20];
        try {
            // create the destination file in HDFS
            out = fs2.create(new Path(uri2));
            in = fs.open(new Path(uri));
            // positioned (random) read: 20 bytes starting at offset 100
            in.readFully(100, buffer, 0, 20);
            // write the bytes into the HDFS file
            out.write(buffer);
            out.flush();
        } finally {
            // closeStream tolerates null, so a failed open() cannot cause an NPE here
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}
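A hypothetical driver for the class above; the local source path, the HDFS destination path and the NameNode address are placeholders, not values from the original example:

import org.apache.hadoop.conf.Configuration;

public class HdfsWrDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // copy 20 bytes (starting at offset 100) of a local file into a new HDFS file
        new HdfsWr().fun("file:///tmp/local.txt", conf, "hdfs://localhost:9000/tmp/copy.txt");
    }
}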
Example 2: read from HDFS, write to the local file system
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HdfsRw {
    /**
     * @param uri  HDFS path (source file)
     * @param conf Hadoop configuration
     * @param uri2 local path (destination file)
     * @throws IOException
     */
    public void fun(String uri, Configuration conf, String uri2) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FSDataInputStream in = null;
        // the local side only needs an ordinary output stream
        FileOutputStream out = new FileOutputStream(new File(uri2));
        byte[] buffer = new byte[20];
        try {
            // open the HDFS file for reading
            in = fs.open(new Path(uri));
            // positioned (random) read: 20 bytes starting at offset 100
            in.readFully(100, buffer, 0, 20);
            // write them to the local file
            out.write(buffer);
            out.flush();
            // alternative: let IOUtils drive the copy (a full sketch follows this class)
            // in.skip(100);
            // IOUtils.copyBytes(in, out, 20, 4096, false);
        } finally {
            // closeStream tolerates null, so a failed open() cannot cause an NPE here
            IOUtils.closeStream(out);
            IOUtils.closeStream(in);
        }
    }
}
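The commented-out IOUtils variant can be written out in full as below: position the stream at the offset (seek here, equivalent to the skip(100) in the comment), then let org.apache.hadoop.io.IOUtils.copyBytes copy exactly 20 bytes with a 4096-byte buffer. The paths and the NameNode address are again placeholders:

import java.io.FileOutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HdfsRwWithIOUtils {
    public static void main(String[] args) throws Exception {
        // hypothetical HDFS source and local destination
        String src = "hdfs://localhost:9000/tmp/demo.txt";
        String dst = "/tmp/demo-part.txt";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(src), conf);
        FSDataInputStream in = null;
        FileOutputStream out = null;
        try {
            in = fs.open(new Path(src));
            out = new FileOutputStream(dst);
            // move to byte offset 100 before copying
            in.seek(100);
            // copy exactly 20 bytes using a 4096-byte buffer; do not close the streams here
            IOUtils.copyBytes(in, out, 20, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}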