hadoop入门2

最新推荐文章于 2024-11-17 23:54:03 发布

子非我104

最新推荐文章于 2024-11-17 23:54:03 发布

阅读量487

点赞数

文章标签： hadoop 大数据 hdfs

本文链接：https://blog.csdn.net/qq_61162288/article/details/130589770

版权

FileSystem使用

核心类：org.apache.hadoop.fs.FileSystem，文件系统类（抽象类）

//静态方法创建对象
public static FileSystem newInstance(URI uri,Configuration conf,String user)
/*
          参数一  URI  分布式文件系统 HDFS的资源地址 NN地址 hdfs://linux01:8020
          参数二  Configuration  用户自定义参数设置   副本数 3   物理切块的大小 128M
                 
          参数三  user  客户端用户名      
 */

/*
      org.apache.hadoop.fs.FileSystem  文件系统类  抽象类
         静态方法获取对象(子类对象)
              public static FileSystem newInstance(URI uri,Configuration conf,String user)
                    URI uri: 统一资源标识符   协议://
                                url统一资源定位符   www.baidu.com
                                只要是网络相关的 都是uri包括url
                                迅雷下载 百度网盘 邮件发送 mailto:  jdbc连接
                             分布式文件系统 HDFS的资源地址 NN地址
                             hdfs://linux01:8020
                             构造方法
                                  public URI(String str)
                   Configuration conf:用户自定义参数设置   副本数 3   物理切块的大小 128M
                                      如果不设置 使用默认设置
                   String user: 用户名  root
 */
public class Demo01_FileSystem {
    public static void main(String[] args) throws URISyntaxException, IOException, InterruptedException {
        //文件系统的客户端连接对象
        URI  uri = new URI("hdfs://linux01:8020");
        //配置对象 没有进行配置 使用默认配置
        Configuration  con = new Configuration();
        //用户名
        String user = "root";
        //通过静态方法 获取 分布式文件系统对象
        FileSystem fs = FileSystem.newInstance(uri, con, user);
        System.out.println(fs);
    }
}

常用方法

public void copyFromLocalFile(Path src, Path dst)  将本地文件上传到文件系统
public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
public void copyToLocalFile(Path src, Path dst)    将文件系统上的文件下载到本地
public void copyToLocalFile(boolean delSrc, Path src, Path dst)
public FileStatus[] listStatus(Path f) 列出目录下所有的内容 包括文件  文件夹
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)列出目录下所有的文件
public FSDataOutputStream create(Path f, boolean overwrite)  获取字节输出流 向文件中写数据
public FSDataOutputStream append(Path path) 向指定的文件路径中追加写入数据
public FSDataInputStream open(Path f) 获取字节输入流 读取文件中数据

获取文件系统对象工具类

public class HDFSUtils {
    private HDFSUtils(){}

    public static FileSystem  getFileSystem() throws  Exception {
        //文件系统的客户端连接对象
        URI uri = new URI("hdfs://linux01:8020");
        //配置对象 没有进行配置 使用默认配置
        Configuration con = new Configuration();
        //用户名
        String user = "root";
        //通过静态方法 获取 分布式文件系统对象
        FileSystem fs = FileSystem.newInstance(uri, con, user);
        return fs;
    }
}

上传文件

/*
     public void copyFromLocalFile(Path src, Path dst)  将本地文件上传到文件系统
     public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
            boolean delSrc:是否删除源文件  true删除 false则不删除
            Path src:数据源  本地系统上的文件
            Path dst:数据目的 文件系统

     1.创建FileSystem对象
     2.调用方法上传文件到分布式文件系统
     3.关闭资源
 */
public class Demo02_FileSystem {
    public static void main(String[] args) throws Exception {
        //通过工具类获取对象
        FileSystem fileSystem = HDFSUtils.getFileSystem();

        /*
            public void copyFromLocalFile(Path src, Path dst)  将本地文件上传到文件系统
            数据源:本地文件  d:\\mm.jpg
            数据目的:分布式文件系统  /
         */
      //上传并改名
        Path  src = new Path("d:\\mm.jpg");
        Path  dest = new Path("/java/meimei.jpg");

        fileSystem.copyFromLocalFile(true,src,dest);

        fileSystem.close();    
    }
}

下载文件

/*
    将文件系统上的文件下载到本地
    public void copyToLocalFile(Path src, Path dst)
    public void copyToLocalFile(boolean delSrc, Path src, Path dst)

                boolean delSrc: 是否删除数据源  true删除 false不删除
                Path src:数据源 文件系统
                Path dst:数据目的 本地

    注意：由于 Windows 与 HDFS 的兼容性不好，需要先在本地安装 Hadoop 环境，否则无法下载文件
        解压 hadoop3.1.1 配置环境变量HADOOP_HOME 加入Path
        需要重启idea  可能需要重启电脑
 */
public class Demo03_FileSystem {
    public static void main(String[] args) throws Exception {
        //通过工具类获取对象
        FileSystem fs = HDFSUtils.getFileSystem();


//        Path  src = new Path("/mm.jpg");
//        Path  dest = new Path("d:\\");
        //将文件系统中的文件 下载到本地  自动生成.crc的检验文件
//        fs.copyToLocalFile(src,dest);

        Path  src = new Path("/mm.jpg");
        Path  dest = new Path("d:\\meimei.jpg");
        下载并改名
      //  fs.copyToLocalFile(src,dest);

        /*
            public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem)
            参数1:是否删除源文件
            参数2:数据源
            参数3:数据目的
            参数4:是否使用本地文件系统 true则不生成.crc文件
         */
        fs.copyToLocalFile(false,src,dest,true);
        fs.close();
    }
}

遍历文件

/*
    public FileStatus[] listStatus(Path f) 列出指定目录下所有的内容 包括文件  文件夹
 */
public class Demo04_FileSystem {
    public static void main(String[] args) throws Exception {
        //获取对象
        FileSystem fs = HDFSUtils.getFileSystem();
        //获取根目录下所有内容 文件 文件夹
        FileStatus[] fileStatusArr = fs.listStatus(new Path("/"));
        //增强for循环遍历
        for(FileStatus file : fileStatusArr){
//            System.out.println(file);
            //判断是否是文件夹
            boolean b = file.isDirectory();
            //判断是否是文件
            boolean b2 = file.isFile();
            //获取文件路径
            Path path = file.getPath();
            if(b){
                System.out.println("文件夹:"+path);
            }else{
                System.out.println("文件:"+path);
            }
        }
    }
}

/*
    public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)列出目录下所有的文件 只获取文件
                方法参数
                    Path f:路径
                    boolean recursive:是否递归遍历  true false

                方法返回值
                    RemoteIterator 迭代器
                          hasNext()  判断是否有元素
                          next()  获取元素

              
                    LocatedFileStatus  分布式文件系统上的文件对象 可以获取文件的信息
                                       获取文件大小  副本个数  block块个数 大小等等

                                 Path getPath() 获取路径
                                 long getLen()  获取文件的字节数
                                 short getReplication()获取副本个数
                                 long  getBlockSize() 获取block块的大小
                                 BlockLocation[] getBlockLocations() 获取block块数组

                    BlockLocation  物理切块对象
                           String[]   getHosts()  获取block块在主机的位置
                           String[]   getNames()  获取block块在主机的名称端口
                           long   getLength()  获取每个切块的大小
                           long   getOffset() 获取偏移量
 */
public class Demo05_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //调用方法获取根目录下的所有内容
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/"), true);
        while (it.hasNext()) {
            LocatedFileStatus file = it.next();
//            System.out.println(file.getPath());

            //获取路径
            Path path = file.getPath();
            //获取文件的字节数
            long len = file.getLen();
            System.out.println("文件大小:" + len * 1.0 / 1024 / 1024 + "M");
            //获取文件的副本
            short s = file.getReplication();
            System.out.println("副本个数:" + s);
            //获取block大小
            long blockSize = file.getBlockSize();
            System.out.println("block块大小:" + blockSize * 1.0 / 1024 / 1024 + "M");

            //获取block的数组
            BlockLocation[] blockLocations = file.getBlockLocations();
            System.out.println("block块个数:" + blockLocations.length);


            //遍历物理切块数组
            for (BlockLocation b : blockLocations) {
                //获取物理切块的主机地址
                String[] hosts = b.getHosts();
                System.out.println(Arrays.toString(hosts));
                String[] names = b.getNames();
                System.out.println(Arrays.toString(names));
                long length = b.getLength();
                System.out.println("block大小:" + length * 1.0 / 1024 / 1024 + "M");
                long offset = b.getOffset();
                System.out.println("偏移量:" + offset);

            }
            System.out.println("------------------------------");
        }

    }
}

写数据

/*
    public FSDataOutputStream create(Path f, boolean overwrite)  获取字节输出流 向文件中写数据
                boolean overwrite:如果文件存在 是否覆盖  true 覆盖 false不覆盖
 */
public class Demo06_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //获取写数据的字节输出流
        FSDataOutputStream out = fs.create(new Path("/1.txt"), true);

        out.write("hello world\r\n".getBytes());

        out.write("hello boys  ".getBytes());
        out.write("hello girls  ".getBytes());

        out.close();
        fs.close();
    }
}

/*
    追加写入数据
      FSDataOutputStream append(Path path) 向指定的文件路径中追加写入数据
 */
public class Demo07_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //向文件中追加写入数据
        FSDataOutputStream out = fs.append(new Path("/1.txt"));
        out.write("hello  aaa\r\n".getBytes());
        out.write("hello  bbb\r\n".getBytes());
        out.write("hello  ccc\r\n".getBytes());

        out.close();
        fs.close();

    }
}

读数据

/*
    public FSDataInputStream open(Path f) 获取字节输入流 读取文件中数据
 */
public class Demo08_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //获取字节输入流 读取文件中数据
        FSDataInputStream in = fs.open(new Path("/1.txt"));

//        byte[] bytes = new byte[1024];
//        int len =0;
//        while((len = in.read(bytes))!=-1){
//            System.out.println(new String(bytes,0,len));
//
//        }
        //一行一行读取数据
        InputStreamReader isr = new InputStreamReader(in);
        //创建缓冲流
        BufferedReader br = new BufferedReader(isr);
//        String s = br.readLine();
        String line = null;
        while((line = br.readLine())!=null){
            System.out.println(line);
        }
        br.close();
        fs.close();

    }
}

/*
    seek和skip方法补充
 */
public class Demo09_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //获取字节输入流 读取文件中数据
        FSDataInputStream in = fs.open(new Path("/1.txt"));

        //跳过几个字节
       // in.skip(1);
        //指定指针标记读取
        in.seek(0);

        int read = in.read();
        System.out.println(read);

        in.close();

        fs.close();
    }
}

其他方法

//DistributedFileSystem
public class Demo10_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fileSystem = HDFSUtils.getFileSystem();

        fileSystem.delete(new Path("/1.txt"),true);  //删除
        fileSystem.mkdirs(new Path("/aaa/bbb") ); //创建文件夹
        fileSystem.exists(new Path("/aaa/1.txt")); //判断路径是否存在

        //...
      fileSystem.close();
    }
}

小文件合并

Hadoop 擅长存储大文件，因为大文件的元数据信息比较少。如果 Hadoop 集群中有大量的小文件，那么每个小文件都需要维护一份元数据信息，会大大增加集群管理元数据的内存压力。所以在实际工作中，如果有必要，一定要将小文件合并成大文件再一起处理。在 HDFS 的 shell 命令模式下，可以通过命令行将很多 HDFS 文件合并成一个大文件下载到本地：

 hdfs dfs -getmerge /aaa/* ./abc.txt

既然可以在下载的时候将这些小文件合并成一个大文件一起下载，那么肯定就可以在上传的时候将小文件合并到一个大文件里面去

     FileSystem fs = HDFSUtils.getFS();

        FSDataOutputStream out = fs.create(new Path("/aaa/big.txt"));

        LocalFileSystem local = FileSystem.getLocal(new Configuration());
        FileStatus[] fileStatuses = local.listStatus(new Path("file:///d:\\input"));

        for (FileStatus fileStatus : fileStatuses) {

            FSDataInputStream in = local.open(fileStatus.getPath());
            IOUtils.copy(in,out);

            IOUtils.closeQuietly(in);
        }

        IOUtils.closeQuietly(out);

参数配置

使用Configuration类进行配置

       Configuration conf = new Configuration();
        // 修改存储的副本个数  5  name  value
        conf.set( "dfs.replication", "5");
        // 修改物理切块的大小
        conf.set("dfs.blocksize", "64m");

使用配置文件进行配置

在resources下创建配置文件hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->
<configuration>
        <!-- Number of stored replicas. -->
        <property>
             <name>dfs.replication</name>
                 <value>5</value>
        </property>
        <!-- Size of a physical block. -->
        <property>
             <name>dfs.blocksize</name>
                 <value>64m</value>
        </property>
</configuration>