读取HDFS上文件Demo (Demo: reading a file from HDFS)
package com.utils;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* 读取Hdfs上的文件
*
* @author chichuduxing
* @date 2016年9月19日 下午14:19:14
*/
/**
 * Reads a file stored on HDFS line by line.
 *
 * <p>Usage: construct with an initialized {@link FileSystem}, call
 * {@link #Init(String)} once, then call {@link #next(List, int)}
 * repeatedly until it stops adding lines.
 *
 * @author chichuduxing
 * @date 2016-09-19
 */
public class HdfsReader {
    /** Logger for this class. */
    protected static final Logger logger = LoggerFactory.getLogger(HdfsReader.class);

    /** HDFS file system handle supplied by the caller. */
    private FileSystem _fs = null;

    /** Line reader wrapped around the open HDFS input stream. */
    private LineReader _lineReader = null;

    /** HDFS input stream of the file being read. */
    private FSDataInputStream _fsInputStream = null;

    /**
     * Creates a reader bound to the given file system.
     *
     * @param fs an already-initialized HDFS file system handle
     */
    public HdfsReader(FileSystem fs) {
        this._fs = fs;
    }

    /**
     * Opens the given file and prepares the line reader.
     *
     * @param file path of the file to read (e.g. {@code /tmp/readdemo.txt})
     * @return {@code true} if the stream and line reader were created
     */
    public boolean Init(String file) {
        if (null == file || file.isEmpty()) {
            logger.error("file name is null");
            return false;
        }
        try {
            Path file_path = new Path(file);
            if (!_fs.exists(file_path)) {
                logger.error("{} not exist!", file);
                return false;
            }
            // Open the data stream and wrap it in a line reader.
            this._fsInputStream = this._fs.open(file_path);
            this._lineReader = new LineReader(_fsInputStream, _fs.getConf());
            return true;
        } catch (Exception e) {
            logger.error("create line reader failed --" + e.getMessage(), e);
            return false;
        }
    }

    /**
     * Reads up to {@code lineCont} lines from the file into {@code dataList}.
     * When end-of-file is reached the underlying stream is closed and the
     * lines read so far are kept in {@code dataList}.
     *
     * @param dataList receives the lines read (appended; not cleared here)
     * @param lineCont maximum number of lines {@code dataList} should hold
     * @return {@code true} on success (including EOF), {@code false} when the
     *         reader is not initialized, {@code dataList} is null, or a read fails
     */
    public boolean next(List<String> dataList, int lineCont) {
        if (null == this._lineReader || null == dataList) {
            return false;
        }
        Text line = new Text();
        while (dataList.size() < lineCont) {
            try {
                // A non-positive return value means the file is exhausted.
                if (this._lineReader.readLine(line) <= 0) {
                    CloseFileStream();
                    break;
                }
            } catch (Exception e) {
                logger.error("read file failed --" + e.getMessage(), e);
                CloseFileStream();
                return false;
            }
            dataList.add(line.toString());
        }
        logger.info("get data count: {}", dataList.size());
        return true;
    }

    /**
     * Closes the line reader and the input stream. Each resource is closed
     * independently so that one being null (or failing to close) cannot
     * prevent the other from being released.
     */
    private void CloseFileStream() {
        try {
            if (this._lineReader != null) {
                this._lineReader.close();
            }
        } catch (IOException e) {
            logger.error("CloseFileStream() failed --" + e.getMessage(), e);
        } finally {
            _lineReader = null;
        }
        try {
            if (this._fsInputStream != null) {
                this._fsInputStream.close();
            }
        } catch (IOException e) {
            logger.error("CloseFileStream() failed --" + e.getMessage(), e);
        } finally {
            _fsInputStream = null;
        }
    }
}
写文件到HDFS上Demo (Demo: writing a file to HDFS)
package com.utils;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* 写文件到Hdfs上
*
* @author chichuduxing
* @date 2016年9月19日 上午11:19:14
*/
/**
 * Writes data to a file on HDFS. Data is first appended to a temporary
 * file ({@code <outPath>.tmp}); when {@link #CloseFileStream()} is called
 * the temporary file is renamed to the final output path, provided at
 * least one write succeeded (prevents empty output files).
 *
 * @author chichuduxing
 * @date 2016-09-19
 */
public class HdfsWriter {
    /** Logger for this class. */
    private static final Logger logger = LoggerFactory.getLogger(HdfsWriter.class);

    /** Final HDFS output file path (e.g. /tmp/output/test.txt). */
    private final String _hdfsOutDirectory;

    /** Temporary file that receives the data before the final rename. */
    private Path _tmpFilePath;

    /** HDFS file system handle supplied by the caller. */
    private FileSystem _fs = null;

    /** Output stream onto the temporary file. */
    private OutputStream _outputStream = null;

    /** Whether any data was written; guards against producing empty files. */
    private boolean _ifWriteData = false;

    /**
     * Creates a writer bound to the given file system and output path.
     *
     * @param fs an already-initialized HDFS file system handle
     * @param outPath final output file path (e.g. {@code /tmp/output/test.txt})
     */
    public HdfsWriter(FileSystem fs, String outPath) {
        this._hdfsOutDirectory = outPath;
        this._fs = fs;
    }

    /**
     * Initializes the temporary file and the output stream handle.
     *
     * @return {@code true} on success, {@code false} if stream setup failed
     */
    public boolean init() throws Exception {
        logger.info("HdfsWriter.init() start...");
        try {
            // Create the temporary file and open the append stream.
            InitOutStream();
        } catch (Exception e) {
            logger.error("HdfsLoader.Init() failed.", e);
            return false;
        }
        logger.info("HdfsWriter.init() done...");
        return true;
    }

    /**
     * Creates the temporary file (deleting any stale one) and opens an
     * append stream onto it.
     *
     * @throws Exception if the temporary file cannot be created or opened
     */
    private void InitOutStream() throws Exception {
        try {
            // Derive the temporary file name from the final output path.
            String tmp_file_name = this._hdfsOutDirectory + ".tmp";
            this._tmpFilePath = new Path(tmp_file_name);
            if (this._fs.exists(this._tmpFilePath)) {
                this._fs.delete(this._tmpFilePath, true);
            }
            if (!this._fs.createNewFile(this._tmpFilePath)) {
                throw new Exception("create tmp hdfs file failed. --" + this._tmpFilePath);
            }
            logger.info("create hdfs tmp file success: {}", tmp_file_name);
            // Acquire the stream handle for appending to the new file.
            this._outputStream = this._fs.append(this._tmpFilePath);
        } catch (Exception e) {
            this._outputStream = null;
            throw new Exception("HdfsWriter.InitOutStream() failed.", e);
        }
    }

    /**
     * Appends the given text to the temporary HDFS file. On a write failure
     * the current stream is closed (which may rename already-written data to
     * the final path) and a fresh temporary stream is opened.
     *
     * @param data text to write; ignored when null/empty or stream not open
     */
    public void WriteData(String data) {
        if (null == data || data.isEmpty() || null == this._outputStream)
            return;
        // Encode explicitly as UTF-8: the previous implementation used the
        // platform default charset, which is not portable across hosts.
        byte[] bcp_bytes = data.getBytes(StandardCharsets.UTF_8);
        // Write the bcp stream data to HDFS.
        if (0 < bcp_bytes.length) {
            try {
                this._outputStream.write(bcp_bytes);
                this._outputStream.flush();
                logger.info("upload bcp data success. --" + this._tmpFilePath.toString());
                this._ifWriteData = true;
            } catch (Exception ex) {
                logger.error("##hdfs write error##: " + ex.getMessage(), ex);
                // Move the temporary file aside.
                CloseFileStream();
                try {
                    // Reset the output stream.
                    InitOutStream();
                } catch (Exception e) {
                    // Keep the stack trace; a bare getMessage() hides the cause.
                    logger.error(e.getMessage(), e);
                }
            }
        }
    }

    /**
     * Closes the output stream and renames the temporary file to the
     * final output path.
     */
    public void CloseFileStream() {
        try {
            // Close the file stream if it is open.
            if (null != this._outputStream) {
                logger.info("close the hdfs file stream.");
                this._outputStream.close();
            }
        } catch (Exception e) {
            logger.error("close the hdfs file stream failed,file is:" + this._tmpFilePath, e);
        } finally {
            _outputStream = null;
        }
        // Move the temporary file to its final name.
        RenameFile();
    }

    /**
     * Renames the temporary file to the final output path, replacing any
     * existing file there. Skipped when nothing was written, to avoid
     * producing an empty output file.
     */
    private void RenameFile() {
        // Only rename when data was actually written.
        if (!this._ifWriteData)
            return;
        Path res_path = new Path(this._hdfsOutDirectory);
        try {
            if (this._fs.exists(res_path)) {
                this._fs.delete(res_path, true);
            }
            if (!this._fs.rename(this._tmpFilePath, res_path)) {
                logger.error("rename [" + this._tmpFilePath + "] to [" + res_path + "] failed.");
                return;
            }
        } catch (IOException e) {
            logger.error("rename the hdfs file failed,file is:" + this._tmpFilePath, e);
            // Bug fix: the failure path previously fell through and logged
            // "rename ... ok." even though the rename did not happen.
            return;
        }
        logger.info("rename [" + this._tmpFilePath + "] to [" + res_path + "] ok.");
    }
}