import java.io.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
// HDFS file utility class: reads the text content of a file at any HDFS file path
public class HdfsFileOperatorUtil {
Logger logger = Logger.getLogger(HdfsFileOperatorUtil.class);//添加日志
// 加载配置文件到内存对象
static Configuration hadoopConf = new Configuration();
// 从HDFS上读取文件
public static String readFromFile(String srcFile) throws Exception {
//文件路径的空判断
if (srcFile == null || srcFile.trim().length() == 0) {
throw new Exception("所要读取的源文件不存在");
}
//获取hadoopConf对应的hdfs集群的对象引用
FileSystem fs = FileSystem.get(hadoopConf);
Path hdfsPath=new Path(srcFile);
FSDataInputStream hdfsInStream = fs.open(hdfsPath);
//初始化一块字节数组缓冲区,大小为65536。缓存每次从流中读取出来的字节数组
byte[] byteArray = new byte[65536];
//初始化字节数输出流, 存放最后的所有字节数组
ByteArrayOutputStream bos = new ByteArrayOutputStream();
// 实际读过来多少
int readLen = 0;
while ((readLen = hdfsInStream.read(byteArray)) > 0) {
bos.write(byteArray);
byteArray = new byte[65536];
}
hdfsInStream.close();
//将utf-8编码的字节数组通过utf-8再进行解码
return new String(bos.toByteArray(), "utf-8");
}
************************************************************************************
public static void main(String[] args) throws Exception {
//定义要读入的hdfs的文件路径
String hdfsFilePath = "/tmp/tian/input.txt";
//将文件从hdfs读取下来,转化成字符串
String result = readFromFile(hdfsFilePath);
//根据题意,将字符串通过命令行输出
System.out.println(result);
}
}
// Reading a file from the cluster via the Java API
// (latest recommended article published 2021-02-27 02:04:46)