/*
 * HDFS operations via the Java API:
 *   1. Read an HDFS file, transform its contents, and write the result to a new file.
 *   2. Decompress multi-level .tgz archives into a target directory.
 */
package com.cupdata.dataConversion.utils;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.PostConstruct;

import lombok.extern.slf4j.Slf4j;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
@Slf4j
@Component
public class HdfsUtils {
@Value("${hdfs.path}")
private String path;
@Value("${hdfs.username}")
private String username;
private static String hdfsPath;
private static String hdfsName;
private static final int bufferSize = 1024 * 1024 * 64;
//缓冲区大小
public static final int TA_BUFFER_SIZE = 1024 * 2;
/**
* 获取HDFS配置信息
* @return
*/
private static Configuration getConfiguration() {
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS", hdfsPath);
return configuration;
}
/**
* 获取HDFS文件系统对象
* @return
* @throws Exception
*/
public static FileSystem getFileSystem() throws
URISyntaxException,InterruptedException, IOException {
// 客户端去操作hdfs时是有一个用户身份的,默认情况下hdfs客户端api会从jvm中获取一个参数作为自己的用户身份
// DHADOOP_USER_NAME=hadoop
// 也可以在构造客户端fs对象时,通过参数传递进去
FileSystem fileSystem = FileSystem.get(new URI(hdfsPath), getConfiguration(), hdfsName);
return fileSystem;
}
/**
* 在HDFS创建文件夹
* @param path
* @return
* @throws Exception
*/
public static boolean mkdir(String path) throws InterruptedException, IOException, URISyntaxException {
if (StringUtils.isEmpty(path)) {
return false;
}
if (existFile(path)) {
return true;
}
FileSystem fs = getFileSystem();
// 目标路径
Path srcPath = new Path(path);
boolean isOk = fs.mkdirs(srcPath);
fs.close();
return isOk;
}
/**
* 判断HDFS文件是否存在
* @param path
* @return
* @throws Exception
*/
public static boolean existFile(String path) throws InterruptedException, IOException, URISyntaxException {
if (StringUtils.isEmpty(path)) {
return false;
}
FileSystem fs = getFileSystem();
Path srcPath = new Path(path);
boolean isExists = fs.exists(srcPath);
return isExists;
}
/**
* 判断是否是文件夹
* @param path
* @return
* @throws InterruptedException
* @throws IOException
* @throws URISyntaxException
*/
public static boolean isDirectory(String path) throws InterruptedException, IOException, URISyntaxException {
if (StringUtils.isEmpty(path)) {
return false;
}
FileSystem fs = getFileSystem();
Path srcPath = new Path(path);
boolean isExists = fs.getFileStatus(srcPath).isDirectory();
return isExists;
}
/**
* 读取HDFS目录信息
* @param path
* @return
* @throws Exception
*/
public static List<Map<String, Object>> readPathInfo(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return null;
}
if (!existFile(path)) {
return null;
}
FileSystem fs = getFileSystem();
// 目标路径
Path newPath = new Path(path);
FileStatus[] statusList = fs.listStatus(newPath);
List<Map<String, Object>> list = new ArrayList<>();
if (null != statusList && statusList.length > 0) {
for (FileStatus fileStatus : statusList) {
Map<String, Object> map = new HashMap<>();
map.put("filePath", fileStatus.getPath());
map.put("fileStatus", fileStatus.toString());
list.add(map);
}
return list;
} else {
return null;
}
}
/**
* HDFS创建文件
* @param path
* @param file
* @throws Exception
*/
public static void createFile(String path, MultipartFile file) throws Exception {
if (StringUtils.isEmpty(path) || null == file.getBytes()) {
return;
}
String fileName = file.getOriginalFilename();
FileSystem fs = getFileSystem();
// 上传时默认当前目录,后面自动拼接文件的目录
Path newPath = new Path(path + "/" + fileName);
// 打开一个输出流
FSDataOutputStream outputStream = fs.create(newPath);
outputStream.write(file.getBytes());
outputStream.close();
fs.close();
}
/**
* 读取HDFS文件内容
* @param path
* @return
* @throws Exception
*/
public static String readFile(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return null;
}
if (!existFile(path)) {
return null;
}
FileSystem fs = getFileSystem();
// 目标路径
Path srcPath = new Path(path);
FSDataInputStream inputStream = null;
try {
inputStream = fs.open(srcPath);
// 防止中文乱码
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
String lineTxt = "";
StringBuilder sb = new StringBuilder();
while ((lineTxt = reader.readLine()) != null) {
sb.append(lineTxt);
}
return sb.toString();
} finally {
inputStream.close();
fs.close();
}
}
/**
* 读取HDFS文件加工后写入新文件
* 支持.txt 和 .csv文件
* @param sourcePath 源文件
* @param destinationPath 目标文件
* @return
* @throws Exception
*/
public static String readAndWriteFile(String sourcePath,String destinationPath)
throws InterruptedException, IOException, URISyntaxException {
//dw_date
String dw_date = null ;
String firstName = null ;
if(sourcePath.endsWith(".txt")){
firstName = sourcePath.split(".txt")[0];
}else if(sourcePath.endsWith(".csv")){
firstName = sourcePath.split(".csv")[0];
}
if(firstName != null){
int len = firstName.length();
dw_date = firstName.substring(len-8);
}else{
log.error("截取文件名获取firstName:{}",firstName);
return "截取文件名获取dw_date失败";
}
if(dw_date != null && dw_date.length() == 8 ){
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyyMMdd");
try{
simpleDateFormat.parse(dw_date);
}catch (ParseException e){
log.error("dw_date:{}时间格式转换异常",dw_date,e);
return "dw_date不符合时间格式" ;
}
}
FileSystem fs = getFileSystem();
// 源路径
Path srcPath = new Path(sourcePath);
//目标路径
Path newPath = new Path( destinationPath);
FSDataInputStream inputStream = null;
FSDataOutputStream outputStream = null;
BufferedReader reader = null ;
BufferedWriter write = null ;
try {
inputStream = fs.open(srcPath);
outputStream = fs.create(newPath);
// 防止中文乱码
reader = new BufferedReader(new InputStreamReader(inputStream),1024 * 200);
write = new BufferedWriter(new OutputStreamWriter(outputStream),1024 * 200);
String lineTxt = "";
while ((lineTxt = reader.readLine()) != null) {
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append(lineTxt);
stringBuilder.append(dw_date);
stringBuilder.append("|");
write.write(stringBuilder.toString());
write.newLine();
write.flush();
}
return "文件转换成功";
} finally {
inputStream.close();
outputStream.close();
reader.close();
write.close();
fs.close();
}
}
/**
* 解压多级tgz文件,并写入指定目录
* @param tgzFilePath
* @param destDir
* @throws InterruptedException
* @throws IOException
* @throws URISyntaxException
*/
public static void decompression(String tgzFilePath,String destDir) throws InterruptedException, IOException, URISyntaxException {
FileSystem fs = getFileSystem();
Path inputPath = new Path(tgzFilePath);
InputStream inputStream = null;
OutputStream outputStream = null;
GzipCompressorInputStream gzipCompressorInputStream = null ;
TarArchiveInputStream tarIn = null ;
try {
inputStream = fs.open(inputPath);
gzipCompressorInputStream = new GzipCompressorInputStream(inputStream);
tarIn = new TarArchiveInputStream(gzipCompressorInputStream, TA_BUFFER_SIZE);
TarArchiveEntry entry = null;
while ((entry = tarIn.getNextTarEntry()) != null) {
String outputUri = destDir + File.separator + entry.getName();
//是目录
if (entry.isDirectory()) {
mkdir(outputUri);
} else {
outputStream = fs.create(new Path(outputUri));
IOUtils.copyBytes(tarIn, outputStream, bufferSize,false);
//flush写入文件,关闭流
outputStream.flush();
IOUtils.closeStream(outputStream);
}
}
} finally {
IOUtils.closeStream(inputStream);
IOUtils.closeStream(gzipCompressorInputStream);
IOUtils.closeStream(tarIn);
IOUtils.closeStream(outputStream);
}
}
/**
* 读取HDFS文件列表
* @param path
* @return
* @throws Exception
*/
public static List<Map<String, String>> listFile(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return null;
}
if (!existFile(path)) {
return null;
}
FileSystem fs = getFileSystem();
// 目标路径
Path srcPath = new Path(path);
// 递归找到所有文件
RemoteIterator<LocatedFileStatus> filesList = fs.listFiles(srcPath, true);
List<Map<String, String>> returnList = new ArrayList<>();
while (filesList.hasNext()) {
LocatedFileStatus next = filesList.next();
String fileName = next.getPath().getName();
Path filePath = next.getPath();
Map<String, String> map = new HashMap<>();
map.put("fileName", fileName);
map.put("filePath", filePath.toString());
returnList.add(map);
}
fs.close();
return returnList;
}
/**
* HDFS重命名文件
* @param oldName
* @param newName
* @return
* @throws Exception
*/
public static boolean renameFile(String oldName, String newName) throws Exception {
if (StringUtils.isEmpty(oldName) || StringUtils.isEmpty(newName)) {
return false;
}
FileSystem fs = getFileSystem();
// 原文件目标路径
Path oldPath = new Path(oldName);
// 重命名目标路径
Path newPath = new Path(newName);
boolean isOk = fs.rename(oldPath, newPath);
fs.close();
return isOk;
}
/**
* 删除HDFS文件
* @param path
* @return
* @throws Exception
*/
public static boolean deleteFile(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return false;
}
if (!existFile(path)) {
return false;
}
FileSystem fs = getFileSystem();
Path srcPath = new Path(path);
boolean isOk = fs.deleteOnExit(srcPath);
fs.close();
return isOk;
}
/**
* 上传HDFS文件
* @param path
* @param uploadPath
* @throws Exception
*/
public static void uploadFile(String path, String uploadPath) throws Exception {
if (StringUtils.isEmpty(path) || StringUtils.isEmpty(uploadPath)) {
return;
}
FileSystem fs = getFileSystem();
// 上传路径
Path clientPath = new Path(path);
// 目标路径
Path serverPath = new Path(uploadPath);
// 调用文件系统的文件复制方法,第一个参数是否删除原文件true为删除,默认为false
fs.copyFromLocalFile(false, clientPath, serverPath);
fs.close();
}
/**
* 下载HDFS文件
* @param path
* @param downloadPath
* @throws Exception
*/
public static void downloadFile(String path, String downloadPath) throws Exception {
if (StringUtils.isEmpty(path) || StringUtils.isEmpty(downloadPath)) {
return;
}
FileSystem fs = getFileSystem();
// 上传路径
Path clientPath = new Path(path);
// 目标路径
Path serverPath = new Path(downloadPath);
// 调用文件系统的文件复制方法,第一个参数是否删除原文件true为删除,默认为false
fs.copyToLocalFile(false, clientPath, serverPath);
fs.close();
}
/**
* HDFS文件复制
* @param sourcePath
* @param targetPath
* @throws Exception
*/
public static void copyFile(String sourcePath, String targetPath) throws Exception {
if (StringUtils.isEmpty(sourcePath) || StringUtils.isEmpty(targetPath)) {
return;
}
FileSystem fs = getFileSystem();
// 原始文件路径
Path oldPath = new Path(sourcePath);
// 目标路径
Path newPath = new Path(targetPath);
FSDataInputStream inputStream = null;
FSDataOutputStream outputStream = null;
try {
inputStream = fs.open(oldPath);
outputStream = fs.create(newPath);
IOUtils.copyBytes(inputStream, outputStream, bufferSize, false);
} finally {
inputStream.close();
outputStream.close();
fs.close();
}
}
/**
* 打开HDFS上的文件并返回byte数组
* @param path
* @return
* @throws Exception
*/
public static byte[] openFileToBytes(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return null;
}
if (!existFile(path)) {
return null;
}
FileSystem fs = getFileSystem();
// 目标路径
Path srcPath = new Path(path);
try {
FSDataInputStream inputStream = fs.open(srcPath);
return IOUtils.readFullyToByteArray(inputStream);
} finally {
fs.close();
}
}
/**
* 获取某个文件在HDFS的集群位置
* @param path
* @return
* @throws Exception
*/
public static BlockLocation[] getFileBlockLocations(String path) throws Exception {
if (StringUtils.isEmpty(path)) {
return null;
}
if (!existFile(path)) {
return null;
}
FileSystem fs = getFileSystem();
// 目标路径
Path srcPath = new Path(path);
FileStatus fileStatus = fs.getFileStatus(srcPath);
return fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
}
@PostConstruct
public void getPath() {
hdfsPath = this.path;
}
@PostConstruct
public void getName() {
hdfsName = this.username;
}
public static String getHdfsPath() {
return hdfsPath;
}
public String getUsername() {
return username;
}
}
/*
 * Most methods were adapted from publicly available online resources; the
 * multi-level archive decompression and the file-content transformation
 * (readAndWriteFile) are original work.
 */