FileSystem api:http://hadoop.apache.org/docs/current/api/org/apache/hadoop/fs/FileSystem.html
FileStatus 类:https://blog.csdn.net/qq_40794973/article/details/88064201
HDFS的API操作:https://blog.csdn.net/qq_40794973/article/details/86713917#t11
LocatedFileStatus 是 FileStatus 的子类
import org.apache.hadoop.fs.FileStatus;
常用方法举例
listFiles() 方法:
/**
* List the statuses and block locations of the files in the given path.
*
* If the path is a directory,
* if recursive is false, returns files in the directory;
* if recursive is true, return files in the subtree rooted at the path.
* If the path is a file, return the file's status and block locations.
*
* @param f is the path
* @param recursive if the subdirectories need to be traversed recursively
*
* @return an iterator that traverses statuses of the files
*
* @throws FileNotFoundException when the path does not exist;
* IOException see specific implementation
*/
public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)throws FileNotFoundException, IOException {......省略...}
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
/**
 * Obtains a FileSystem handle for the given HDFS URI, acting as the given user.
 *
 * @param uri  HDFS address, e.g. "hdfs://hadoop102:9000"
 * @param user user name used for HDFS permission checks
 * @return a FileSystem client bound to the URI
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    return FileSystem.get(new URI(uri), conf, user);
}
/**
 * 遍历所有的文件 / 文件详情查看.
 *
 * Recursively lists every file under "/" and prints its name, path, length,
 * permission, group and the host nodes storing each of its blocks.
 */
@Test
public void test8() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources guarantees fs.close() even when the listing or
    // iteration throws — the original only closed it on the success path.
    try (FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        // 2. Get file details. RemoteIterator fetches its elements lazily
        // from the NameNode; second argument true = recursive traversal.
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
        while (listFiles.hasNext()) {
            // LocatedFileStatus extends FileStatus with block locations
            LocatedFileStatus status = listFiles.next();
            System.out.println("文件名称: " + status.getPath().getName());
            System.out.println("路径: " + status.getPath());
            System.out.println("长度: " + status.getLen());
            System.out.println("权限: " + status.getPermission());
            System.out.println("分组: " + status.getGroup());
            // Block storage information: one BlockLocation per block.
            BlockLocation[] blockLocations = status.getBlockLocations();
            for (BlockLocation blockLocation : blockLocations) {
                // Hosts (DataNodes) storing replicas of this block.
                for (String host : blockLocation.getHosts()) {
                    System.out.println("存储的主机节点: " + host);
                }
            }
            System.out.println("------------------------------------------------------------");
        }
    }
}
输出:
文件名称: a.txt
路径: hdfs://hadoop102:9000/a.txt
长度: 25
权限: rw-rw-rw-
分组: atguigu
存储的主机节点: hadoop103
存储的主机节点: hadoop102
存储的主机节点: hadoop104
------------------------------------------------------------
listStatus() 方法:
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
public abstract FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException;
private void listStatus(ArrayList<FileStatus> results, Path f,PathFilter filter) throws FileNotFoundException, IOException
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Timestamp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Builds a FileSystem client for the given HDFS address on behalf of the
 * given user.
 *
 * @param uri  HDFS address, e.g. "hdfs://hadoop102:9000"
 * @param user HDFS user to act as
 * @return a connected FileSystem instance
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    return FileSystem.get(new URI(uri), new Configuration(), user);
}
/**
 * 递归遍历 hdfs 文件系统 — walks the whole tree rooted at "/" via
 * listStatus()/FileStatus (see listFilesStatus).
 */
@Test
public void test9() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources closes the FileSystem even if the walk throws;
    // the original only closed it on the success path.
    try (FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        listFilesStatus(new Path("/"), fs);
    }
}
/**
 * Walks the directory tree rooted at {@code path} on the given file system:
 * prints the details of every regular file, recurses into sub-directories,
 * and flags symlinks.
 *
 * @param path root of the subtree to list
 * @param hdfs file system to query
 */
public void listFilesStatus(Path path, FileSystem hdfs) throws IOException, InterruptedException, URISyntaxException {
    for (FileStatus entry : hdfs.listStatus(path)) {
        if (entry.isFile()) {
            System.out.println("-----------------这是文件--------------------------");
            Path entryPath = entry.getPath();
            // Emit every attribute in a single printout, same layout as before.
            System.out.println("文件长度: " + entry.getLen() + "\n" +
                    "文件路径: " + entryPath.toString() + "\n" +
                    "文件名称: " + entryPath.getName() + "\n" +
                    "文件父路径: " + entryPath.getParent().toString() + "\n" +
                    "文件最后修改时间: " + new Timestamp(entry.getModificationTime()) + "\n" +
                    "文件块大小: " + entry.getBlockSize() + "\n" +
                    "文件所属组: " + entry.getGroup() + "\n" +
                    "文件拥有者: " + entry.getOwner() + "\n" +
                    "该文件上次访问时间: " + entry.getAccessTime() + "\n" +
                    "文件副本数: " + entry.getReplication() + "\n" +
                    "-------------------------------------------");
        } else if (entry.isDirectory()) {
            System.out.println("--这是文件夹--");
            System.out.println("文件父路径: " + entry.getPath().toString());
            System.out.println("-----------");
            // Descend into the sub-directory.
            listFilesStatus(entry.getPath(), hdfs);
        } else if (entry.isSymlink()) {
            System.out.println("这是链接文件");
        }
    }
}
-----------------这是文件--------------------------
文件长度: 444
文件路径: hdfs://hadoop102:9000/hehe.txt
文件名称: hehe.txt
文件父路径: hdfs://hadoop102:9000/
文件最后修改时间: 2019-03-01 04:48:48.28
文件块大小: 134217728
文件所属组: supergroup
文件拥有者: atguigu
该文件上次访问时间: 1551386928141
文件副本数: 2
-------------------------------------------
--这是文件夹--
文件父路径: hdfs://hadoop102:9000/test
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
/**
 * Lists the status of "/test.txt", converts the FileStatus[] into a Path[]
 * with FileUtil.stat2Paths, and prints each path.
 */
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the HDFS client.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        FileStatus[] fileStatuses = fs.listStatus(new Path("/test.txt"));
        // FileUtil.stat2Paths: FileStatus[] -> Path[]
        Path[] paths = FileUtil.stat2Paths(fileStatuses);
        for (Path path : paths) {
            System.out.println(path);
        }
    }
}
import org.apache.hadoop.fs.FileUtil;
注:Hadoop 的 FileUtil 中 stat2Paths()方法用于把一个FileStatus对象数组转换为一个Path数组。
PathFilter 用于过滤
/**
 * Lists "/" filtered through a PathFilter that keeps only names ending
 * in ".txt", then prints the matching paths.
 */
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // FIX: FileSystem.get(URI, Configuration, String) declares
    // InterruptedException, so this method must declare it too — the original
    // declared only IOException and could not compile.
    // FIX: try-with-resources closes the client, which the original never did.
    try (FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        // Keep only paths whose name ends with ".txt" — lambda form of the
        // anonymous PathFilter class.
        FileStatus[] fileStatuses = fs.listStatus(new Path("/"), path -> path.getName().endsWith(".txt"));
        // FileUtil.stat2Paths: FileStatus[] -> Path[]
        Path[] paths = FileUtil.stat2Paths(fileStatuses);
        for (Path path : paths) {
            System.out.println(path);
        }
    }
}
globStatus() 方法:(用于过滤)
这里的 Path 里面是可以写通配符的,比如 Path path = new Path("/*");
通配符及其含义 * 匹配0到多个字符 ? 匹配单一字符 [ab] 匹配{a,b}集合中的一个字符 [^ab] 匹配非{a,b}集合里的一个字符 [a-b] 匹配在{a,b}范围内的一个字符 [^a-b] 匹配非{a,b}范围内的一个字符 {a,b} 匹配表达式 a 或 b 之一(例如 {ab,cd} 匹配 ab 或 cd) \c 转义元字符 c,使其按字面匹配
找出 / 目录下所有以 .txt 结尾的文件和目录,并且文件名包含 a
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
/**
 * 获取 FileSystem 对象 — connects to the HDFS at {@code uri} as {@code user}.
 *
 * @param uri  HDFS address string
 * @param user user name for permission checks
 * @return the FileSystem client
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    final URI hdfsUri = new URI(uri);
    final Configuration hadoopConf = new Configuration();
    return FileSystem.get(hdfsUri, hadoopConf, user);
}
/**
 * globStatus 路径过滤 — finds the entries directly under "/" whose name
 * contains "a", by combining a glob pattern with a PathFilter.
 * globStatus also understands glob wildcards inside the Path itself, which
 * is handy when pre-processing big-data input directories.
 */
@Test
public void test10() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = getFileSystem("hdfs://hadoop102:9000", "atguigu")) {
        Path pattern = new Path("/*"); // glob pattern (not a full regex)
        // Second argument: a PathFilter applied to the glob matches —
        // lambda form of the anonymous-class version.
        FileStatus[] fileGlobStatuses = hdfs.globStatus(pattern, x -> x.getName().contains("a"));
        if (fileGlobStatuses != null) { // null when the non-glob path does not exist
            Path[] globPaths = FileUtil.stat2Paths(fileGlobStatuses);
            for (Path p : globPaths) {
                System.out.println("globe过滤后的路径" + p);
            }
        } else {
            System.out.println("没有找到对应的目录或者文件");
        }
    }
}
globe过滤后的路径hdfs://hadoop102:9000/hehe
globe过滤后的路径hdfs://hadoop102:9000/hehe.txt
找出目录树前三层里面后缀为 .txt 并且文件名以a开头的文件和目录。
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Uses globStatus with the pattern "/*&#47;*&#47;*&#47;a*.txt" to find entries three
 * directory levels deep whose name starts with "a" and ends with ".txt".
 */
@Test
public void test11() throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        Path pattern = new Path("/*/*/*/a*.txt"); // glob pattern, not a regex
        FileStatus[] status = hdfs.globStatus(pattern);
        if (status != null) { // null only when a non-glob path does not exist
            for (FileStatus p : status) {
                // NOTE(review): globStatus only returns existing paths, so this
                // exists() round-trip is redundant; kept for identical output.
                if (hdfs.exists(p.getPath())) {
                    System.out.println(p.getPath());
                }
            }
        } else {
            System.out.println("没有找到对应的a*.txt文件");
        }
    }
}
hdfs://hadoop102:9000/a.txt
hdfs://hadoop102:9000/aa.txt
hdfs://hadoop102:9000/aaaaa.txt
hdfs://hadoop102:9000/aab.txt
打印出前四层所有的文件和目录
/**
 * Uses globStatus with the pattern "/*&#47;*&#47;*&#47;*" to print every entry that
 * sits exactly four levels below the root.
 */
@Test
public void test11() throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    // 1. Get the file system.
    // FIX: try-with-resources closes the client — the original never closed it.
    try (FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu")) {
        Path pattern = new Path("/*/*/*/*"); // glob pattern, not a regex
        FileStatus[] status = hdfs.globStatus(pattern);
        if (status != null) { // with a glob pattern this is an empty array, never null
            for (FileStatus p : status) {
                // NOTE(review): globStatus only returns existing paths, so this
                // exists() round-trip is redundant; kept for identical output.
                if (hdfs.exists(p.getPath())) {
                    System.out.println(p.getPath());
                }
            }
        } else {
            System.out.println("没有文件");
        }
    }
}
注:前面代码里面的 else 永远执行不到,即使没有匹配到文件,globStatus 返回的也是空数组而不是 null。
// hdfs自身提供了许多filter,在hadoop权威指南中,提供一种 正则表达式filter的实现
/**
 * A PathFilter that EXCLUDES paths matching the given regular expression,
 * as shown in "Hadoop: The Definitive Guide".
 */
public class RegexExcludePathFilter implements PathFilter {
    // FIX: compile the regex once instead of recompiling it on every
    // accept() call (String.matches compiles a fresh Pattern each time);
    // final makes the filter immutable and fails fast on an invalid regex.
    private final java.util.regex.Pattern pattern;

    /**
     * @param regex regular expression; any path matching it is rejected
     */
    public RegexExcludePathFilter(String regex) {
        this.pattern = java.util.regex.Pattern.compile(regex);
    }

    @Override
    public boolean accept(Path path) {
        // Accept everything that does NOT match the regex.
        return !pattern.matcher(path.toString()).matches();
    }
}