FileSystem api:http://hadoop.apache.org/docs/current/api/org/apache/hadoop/fs/FileSystem.html
FileStatus 类:https://blog.csdn.net/qq_40794973/article/details/88064201
HDFS的API操作:https://blog.csdn.net/qq_40794973/article/details/86713917#t11
LocatedFileStatus 是 FileStatus 的子类
import org.apache.hadoop.fs.FileStatus;
常用方法举例
listFiles() 方法:
/**
* List the statuses and block locations of the files in the given path.
*
* If the path is a directory,
* if recursive is false, returns files in the directory;
* if recursive is true, return files in the subtree rooted at the path.
* If the path is a file, return the file's status and block locations.
*
* @param f is the path
* @param recursive if the subdirectories need to be traversed recursively
*
* @return an iterator that traverses statuses of the files
*
* @throws FileNotFoundException when the path does not exist;
* IOException see specific implementation
*/
public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)throws FileNotFoundException, IOException {......省略...}
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
/**
 * Obtains a FileSystem handle for the given HDFS URI, acting as the given user.
 *
 * @param uri  HDFS address, e.g. "hdfs://hadoop102:9000"
 * @param user user name used for HDFS permission checks
 * @return a FileSystem client bound to the URI
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    return FileSystem.get(new URI(uri), conf, user);
}
/**
 * 遍历所有的文件 / 文件详情查看.
 *
 * Recursively lists every file under "/" and prints its name, path, length,
 * permission, group and the host nodes storing each of its blocks.
 */
@Test
public void test8() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources guarantees fs.close() even when the listing or
    // iteration throws — the original only closed it on the success path.
    try (FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        // 2. Get file details. RemoteIterator fetches its elements lazily
        // from the NameNode; second argument true = recursive traversal.
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
        while (listFiles.hasNext()) {
            // LocatedFileStatus extends FileStatus with block locations
            LocatedFileStatus status = listFiles.next();
            System.out.println("文件名称: " + status.getPath().getName());
            System.out.println("路径: " + status.getPath());
            System.out.println("长度: " + status.getLen());
            System.out.println("权限: " + status.getPermission());
            System.out.println("分组: " + status.getGroup());
            // Block storage information: one BlockLocation per block.
            BlockLocation[] blockLocations = status.getBlockLocations();
            for (BlockLocation blockLocation : blockLocations) {
                // Hosts (DataNodes) storing replicas of this block.
                for (String host : blockLocation.getHosts()) {
                    System.out.println("存储的主机节点: " + host);
                }
            }
            System.out.println("------------------------------------------------------------");
        }
    }
}
输出:
文件名称: a.txt
路径: hdfs://hadoop102:9000/a.txt
长度: 25
权限: rw-rw-rw-
分组: atguigu
存储的主机节点: hadoop103
存储的主机节点: hadoop102
存储的主机节点: hadoop104
------------------------------------------------------------
listStatus() 方法:
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
public abstract FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException;
private void listStatus(ArrayList<FileStatus> results, Path f,PathFilter filter) throws FileNotFoundException, IOException
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Timestamp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Builds a FileSystem client for the given HDFS address on behalf of the
 * given user.
 *
 * @param uri  HDFS address, e.g. "hdfs://hadoop102:9000"
 * @param user HDFS user to act as
 * @return a connected FileSystem instance
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    return FileSystem.get(new URI(uri), new Configuration(), user);
}
/**
 * 递归遍历 hdfs 文件系统 — walks the whole tree rooted at "/" via
 * listStatus()/FileStatus (see listFilesStatus).
 */
@Test
public void test9() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources closes the FileSystem even if the walk throws;
    // the original only closed it on the success path.
    try (FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        listFilesStatus(new Path("/"), fs);
    }
}
/**
 * Walks the directory tree rooted at {@code path} on the given file system:
 * prints the details of every regular file, recurses into sub-directories,
 * and flags symlinks.
 *
 * @param path root of the subtree to list
 * @param hdfs file system to query
 */
public void listFilesStatus(Path path, FileSystem hdfs) throws IOException, InterruptedException, URISyntaxException {
    for (FileStatus entry : hdfs.listStatus(path)) {
        if (entry.isFile()) {
            System.out.println("-----------------这是文件--------------------------");
            Path entryPath = entry.getPath();
            // Emit every attribute in a single printout, same layout as before.
            System.out.println("文件长度: " + entry.getLen() + "\n" +
                    "文件路径: " + entryPath.toString() + "\n" +
                    "文件名称: " + entryPath.getName() + "\n" +
                    "文件父路径: " + entryPath.getParent().toString() + "\n" +
                    "文件最后修改时间: " + new Timestamp(entry.getModificationTime()) + "\n" +
                    "文件块大小: " + entry.getBlockSize() + "\n" +
                    "文件所属组: " + entry.getGroup() + "\n" +
                    "文件拥有者: " + entry.getOwner() + "\n" +
                    "该文件上次访问时间: " + entry.getAccessTime() + "\n" +
                    "文件副本数: " + entry.getReplication() + "\n" +
                    "-------------------------------------------");
        } else if (entry.isDirectory()) {
            System.out.println("--这是文件夹--");
            System.out.println("文件父路径: " + entry.getPath().toString());
            System.out.println("-----------");
            // Descend into the sub-directory.
            listFilesStatus(entry.getPath(), hdfs);
        } else if (entry.isSymlink()) {
            System.out.println("这是链接文件");
        }
    }
}
-----------------这是文件--------------------------
文件长度: 444
文件路径: hdfs://hadoop102:9000/hehe.txt
文件名称: hehe.txt
文件父路径: hdfs://hadoop102:9000/
文件最后修改时间: 2019-03-01 04:48:48.28
文件块大小: 134217728
文件所属组: supergroup
文件拥有者: atguigu
该文件上次访问时间: 1551386928141
文件副本数: 2
-------------------------------------------
--这是文件夹--
文件父路径: hdfs://hadoop102:9000/test
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
/**
 * Lists the status of "/test.txt", converts the FileStatus[] into a Path[]
 * with FileUtil.stat2Paths, and prints each path.
 */
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the HDFS client.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        FileStatus[] fileStatuses = fs.listStatus(new Path("/test.txt"));
        // FileUtil.stat2Paths: FileStatus[] -> Path[]
        Path[] paths = FileUtil.stat2Paths(fileStatuses);
        for (Path path : paths) {
            System.out.println(path);
        }
    }
}
import org.apache.hadoop.fs.FileUtil;
注:Hadoop 的 FileUtil 中 stat2Paths()方法用于把一个FileStatus对象数组转换为一个Path数组。
PathFilter 用于过滤
/**
 * Lists "/" filtered through a PathFilter that keeps only names ending
 * in ".txt", then prints the matching paths.
 */
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // FIX: FileSystem.get(URI, Configuration, String) declares
    // InterruptedException, so this method must declare it too — the original
    // declared only IOException and could not compile.
    // FIX: try-with-resources closes the client, which the original never did.
    try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        // Keep only paths whose name ends with ".txt" — lambda form of the
        // anonymous PathFilter class.
        FileStatus[] fileStatuses = fs.listStatus(new Path("/"), path -> path.getName().endsWith(".txt"));
        // FileUtil.stat2Paths: FileStatus[] -> Path[]
        Path[] paths = FileUtil.stat2Paths(fileStatuses);
        for (Path path : paths) {
            System.out.println(path);
        }
    }
}
globStatus() 方法:(用于过滤)
这里的 Path 里面是可以写通配符的,比如 Path path = new Path("/*");
通配符及其含义 * 匹配0到多个字符 ? 匹配单一字符 [ab] 匹配{a,b}集合中的一个字符 [^ab] 匹配非{a,b}集合里的一个字符 [a-b] 匹配在{a,b}范围内的一个字符 [^a-b] 匹配非{a,b}范围内的一个字符 {a,b} 匹配表达式 a 或 b 之一(例如 {ab,cd} 匹配 ab 或 cd) \c 转义元字符 c,使其按字面匹配
找出 / 目录下所有以 .txt 结尾的文件和目录,并且文件名包含 a
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
/**
 * 获取 FileSystem 对象 — connects to the HDFS at {@code uri} as {@code user}.
 *
 * @param uri  HDFS address string
 * @param user user name for permission checks
 * @return the FileSystem client
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    final URI hdfsUri = new URI(uri);
    final Configuration hadoopConf = new Configuration();
    return FileSystem.get(hdfsUri, hadoopConf, user);
}
/**
 * globStatus 路径过滤 — finds the entries directly under "/" whose name
 * contains "a", by combining a glob pattern with a PathFilter.
 * globStatus also understands glob wildcards inside the Path itself, which
 * is handy when pre-processing big-data input directories.
 */
@Test
public void test10() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        Path pattern = new Path("/*"); // glob pattern (not a full regex)
        // Second argument: a PathFilter applied to the glob matches —
        // lambda form of the anonymous-class version.
        FileStatus[] fileGlobStatuses = hdfs.globStatus(pattern, x -> x.getName().contains("a"));
        if (fileGlobStatuses != null) { // null when the non-glob path does not exist
            Path[] globPaths = FileUtil.stat2Paths(fileGlobStatuses);
            for (Path p : globPaths) {
                System.out.println("globe过滤后的路径" + p);
            }
        } else {
            System.out.println("没有找到对应的目录或者文件");
        }
    }
}
globe过滤后的路径hdfs://hadoop102:9000/hehe
globe过滤后的路径hdfs://hadoop102:9000/hehe.txt
找出目录树前三层里面后缀为 .txt 并且文件名以a开头的文件和目录。
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Uses globStatus with the pattern "/*&#47;*&#47;*&#47;a*.txt" to find entries three
 * directory levels deep whose name starts with "a" and ends with ".txt".
 */
@Test
public void test11() throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        Path pattern = new Path("/*/*/*/a*.txt"); // glob pattern, not a regex
        FileStatus[] status = hdfs.globStatus(pattern);
        if (status != null) { // null only when a non-glob path does not exist
            for (FileStatus p : status) {
                // NOTE(review): globStatus only returns existing paths, so this
                // exists() round-trip is redundant; kept for identical output.
                if (hdfs.exists(p.getPath())) {
                    System.out.println(p.getPath());
                }
            }
        } else {
            System.out.println("没有找到对应的a*.txt文件");
        }
    }
}
hdfs://hadoop102:9000/a.txt
hdfs://hadoop102:9000/aa.txt
hdfs://hadoop102:9000/aaaaa.txt
hdfs://hadoop102:9000/aab.txt
打印出前四层所有的文件和目录
/**
 * Uses globStatus with the pattern "/*&#47;*&#47;*&#47;*" to print every entry that
 * sits exactly four levels below the root.
 */
@Test
public void test11() throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        Path pattern = new Path("/*/*/*/*"); // glob pattern, not a regex
        FileStatus[] status = hdfs.globStatus(pattern);
        if (status != null) { // with a glob pattern this is an empty array, never null
            for (FileStatus p : status) {
                // NOTE(review): globStatus only returns existing paths, so this
                // exists() round-trip is redundant; kept for identical output.
                if (hdfs.exists(p.getPath())) {
                    System.out.println(p.getPath());
                }
            }
        } else {
            System.out.println("没有文件");
        }
    }
}
注:前面代码里面的 else 永远执行不到,即使没有匹配到文件,globStatus 返回的也是空数组而不是 null。
// hdfs自身提供了许多filter,在hadoop权威指南中,提供一种 正则表达式filter的实现
/**
 * A PathFilter that EXCLUDES paths matching the given regular expression,
 * as shown in "Hadoop: The Definitive Guide".
 */
public class RegexExcludePathFilter implements PathFilter {
    // FIX: compile the regex once instead of recompiling it on every
    // accept() call (String.matches compiles a fresh Pattern each time);
    // final makes the filter immutable and fails fast on an invalid regex.
    private final java.util.regex.Pattern pattern;

    /**
     * @param regex regular expression; any path matching it is rejected
     */
    public RegexExcludePathFilter(String regex) {
        this.pattern = java.util.regex.Pattern.compile(regex);
    }

    @Override
    public boolean accept(Path path) {
        // Accept everything that does NOT match the regex.
        return !pattern.matcher(path.toString()).matches();
    }
}