Hadoop --- 入门之HDFS的JAVA API操作

最新推荐文章于 2024-07-25 16:52:43 发布

__静禅__

最新推荐文章于 2024-07-25 16:52:43 发布

阅读量622

点赞数

分类专栏：大数据 ------ Hadoop

本文链接：https://blog.csdn.net/ka_ka314/article/details/83059006

版权

大数据同时被 2 个专栏收录

6 篇文章 0 订阅

订阅专栏

------ Hadoop

6 篇文章 0 订阅

订阅专栏

JAR准备：

将 hadoop-2.8.0 安装包中 share/hadoop 目录下的以下 jar 包添加到工程的 classpath 中：

common下的hadoop-common-2.8.0.jar
common/lib下的所有jar
hdfs下的hadoop-hdfs-2.8.0.jar
hdfs/lib下的所有jar

示例：

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URI;
import java.util.Iterator;
import java.util.Map.Entry;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Demonstrates basic HDFS operations through the Hadoop Java client API,
 * one JUnit test per operation.
 *
 * <p>An HDFS client always acts on behalf of some user. By default the client
 * API takes that identity from a JVM parameter, e.g.
 * {@code -DHADOOP_USER_NAME=hadoop}; this class instead passes the user name
 * explicitly when it obtains the {@link FileSystem} in {@link #init()}.
 *
 * <p>Every test gets a fresh {@link FileSystem} from {@link #init()} and has it
 * closed by {@link #tearDown()}, so individual tests must not close it
 * themselves.
 */
public class HdfsClientDemo {

	/** Handle to the remote HDFS; opened before and closed after every test. */
	private FileSystem fs;

	/** Client-side configuration used to create {@link #fs}. */
	private Configuration conf;

	/**
	 * Builds the client configuration and connects to the NameNode as user
	 * {@code hadoop}.
	 *
	 * @throws Exception if the URI is malformed or the file system cannot be obtained
	 */
	@Before
	public void init() throws Exception {
		conf = new Configuration();
		// Replication factor requested by this client for the files it writes.
		conf.set("dfs.replication", "2");

		fs = FileSystem.get(new URI("hdfs://192.168.153.136:9000"), conf, "hadoop");
	}

	/**
	 * Closes the file system opened by {@link #init()} so that no test leaks
	 * the connection, whether it passed or failed.
	 *
	 * @throws Exception if closing the file system fails
	 */
	@After
	public void tearDown() throws Exception {
		if (fs != null) {
			fs.close();
			fs = null;
		}
	}

	/** Prints every key/value pair of the effective client configuration. */
	@Test
	public void testConf() {
		for (Entry<String, String> ent : conf) {
			System.out.println(ent.getKey() + ": " + ent.getValue());
		}
	}

	/**
	 * Uploads a local file to HDFS.
	 *
	 * <p>Alternative (1), copying through plain IO streams:
	 * <pre>
	 * FileInputStream fileInputStream = new FileInputStream("e:/Spring MVC.docx");
	 * FSDataOutputStream fsDataOutputStream = fs.create(new Path("/Spring MVC222.docx"));
	 * IOUtils.copy(fileInputStream, fsDataOutputStream);
	 * </pre>
	 *
	 * <p>The test itself uses alternative (2), {@code copyFromLocalFile}.
	 *
	 * @throws Exception if the copy fails
	 */
	@Test
	public void testUpload() throws Exception {
		String localFile = "e:/Spring MVC.docx";
		// The local path has no leading '/', hence slashed == false.
		boolean windowsAbsolutePath = Path.isWindowsAbsolutePath(localFile, false);
		System.out.println(windowsAbsolutePath);
		// Only attempt the upload when the hard-coded Windows path is usable.
		if (windowsAbsolutePath) {
			fs.copyFromLocalFile(new Path(localFile), new Path("/Spring MVC.docx"));
		}
	}

	/**
	 * Downloads a file from HDFS to the local disk.
	 *
	 * <p>Alternative (1), which does not depend on a local Hadoop installation
	 * because it copies through plain IO streams:
	 * <pre>
	 * FSDataInputStream fsDataInputStream = fs.open(new Path("/spring/Spring MVC.docx"));
	 * FileOutputStream fileOutputStream = new FileOutputStream("e:/Spring MVC.docx");
	 * IOUtils.copy(fsDataInputStream, fileOutputStream);
	 * </pre>
	 *
	 * <p>Alternative (2): the two-argument
	 * {@code fs.copyToLocalFile(new Path("/Spring MVC1111.docx"), new Path("e:/"))}
	 * depends on the local Hadoop environment. The four-argument overload used
	 * below avoids that: argument 1 says whether to delete the source on HDFS,
	 * argument 4 says whether to use the raw local file system (plain Java IO).
	 *
	 * @throws Exception if the copy fails
	 */
	@Test
	public void testDownload() throws Exception {
		fs.copyToLocalFile(false, new Path("/Spring MVC1111.docx"), new Path("e:/"), true);
	}

	/**
	 * Creates a directory on HDFS (paths are relative to the root {@code /})
	 * and prints whether the call succeeded.
	 *
	 * @throws Exception if the request fails
	 */
	@Test
	public void makdirTest() throws Exception {
		boolean mkdirs = fs.mkdirs(new Path("/spring"));
		System.out.println(mkdirs);
	}

	/**
	 * Placeholder kept from the original demo; intentionally empty.
	 *
	 * @throws Exception never thrown by the current empty body
	 */
	@Test
	public void createTest() throws Exception {
		// TODO: demonstrate fs.create(...) here.
	}

	/**
	 * Inspects a few paths (existence, file vs. directory), prints the results
	 * and then recursively deletes {@code /spring} if it exists.
	 *
	 * @throws Exception if any of the requests fails
	 */
	@Test
	public void deleteTest() throws Exception {
		Path springDir = new Path("/spring");
		Path docFile = new Path("/Spring MVC.docx");

		// The path may or may not exist, so check before deleting.
		boolean file_exists = fs.exists(springDir);
		// Is the path a regular file?
		boolean file = fs.isFile(docFile);
		// Is the path a directory?
		boolean directory1 = fs.isDirectory(springDir);
		boolean directory2 = fs.isDirectory(docFile);

		System.out.println(file_exists);
		System.out.println(file);
		System.out.println(directory1);
		System.out.println(directory2);
		if (file_exists) {
			// Second argument true = delete recursively.
			boolean delete = fs.delete(springDir, true);
			System.out.println(delete);
		}
	}

	/**
	 * Lists the direct children of the root directory, then walks the whole
	 * tree and prints the metadata and block locations of every file.
	 *
	 * @throws Exception if listing fails
	 */
	@Test
	public void listTest() throws Exception {
		Path root = new Path("/");

		// Non-recursive listing: direct children of the root only.
		FileStatus[] listStatus = fs.listStatus(root);
		for (FileStatus fileStatus : listStatus) {
			System.err.println(fileStatus.getPath() + "=================" + fileStatus.toString());
		}

		// Recursive listing: finds every file below the root.
		RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(root, true);
		while (listFiles.hasNext()) {
			LocatedFileStatus fileStatus = listFiles.next();
			System.out.println("blocksize：" + fileStatus.getBlockSize());
			System.out.println("owner：" + fileStatus.getOwner());
			System.out.println("Replication：" + fileStatus.getReplication());
			System.out.println("Permission：" + fileStatus.getPermission());
			System.out.println("Path：" + fileStatus.getPath());
			System.out.println("FileName：" + fileStatus.getPath().getName());
			System.out.println("File Len：" + fileStatus.getLen());
			BlockLocation[] blockLocations = fileStatus.getBlockLocations();
			for (BlockLocation blockLocation : blockLocations) {
				System.out.println("块起始偏移量：" + blockLocation.getOffset());
				System.out.println("块长度：" + blockLocation.getLength());
				String[] hosts = blockLocation.getHosts();
				for (String datanode : hosts) {
					// Note on replica counts: the client Configuration is separate
					// from the cluster's hdfs-site.xml, and the replication factor
					// of a file comes from the client that wrote it. A file
					// uploaded by a client that did not set dfs.replication gets
					// the client default of 3 replicas, even if hdfs-site.xml says
					// 2. That is why init() calls conf.set("dfs.replication", "2")
					// before any upload.
					System.out.println("块副本位置：" + datanode);
				}
			}
		}
	}

	/**
	 * Prints the content of a file stored on HDFS to standard output.
	 *
	 * @throws Exception if the file cannot be opened or read
	 */
	@Test
	public void testCat() throws Exception {
		// Close the HDFS stream when done, but never close System.out.
		try (FSDataInputStream in = fs.open(new Path("/Spring MVC.docx"))) {
			IOUtils.copy(in, System.out);
		}
		System.out.flush();
	}

}