hadoop学习笔记（二）hadoop hdfs开发

最新推荐文章于 2023-05-17 07:45:22 发布

晴是有风

最新推荐文章于 2023-05-17 07:45:22 发布

阅读量254

点赞数 1

分类专栏： hadoop 文章标签： hadoop hdfs fs

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/qq_34239412/article/details/85121383

版权

hadoop 专栏收录该内容

8 篇文章 2 订阅

订阅专栏

目录

对文件、目录的操作

查看文件列表、文件状态、文件位置、节点信息

文件压缩与解压缩

首先，必须运行hadoop，windows中在hadoop的路径下，sbin目录，start-all.cmd，会跳出四个命令行窗口，不要管它，缩小即可。

如果不先启动hadoop，项目将无法运行并会报错。另外，不要在hadoop刚启动后就立刻运行项目：此时HDFS还处于安全模式，无法正常写入，等一会儿退出安全模式后即可。

其实hdfs开发就是在代码中对hadoop服务器上的文件资源进行管理；大部分操作不写代码、直接在命令行中也能完成，而且有时会更方便。

hadoop fs shell命令官方手册：http://hadoop.apache.org/docs/r2.6.0/hadoop-project-dist/hadoop-common/FileSystemShell.html

hadoop hdfs的常用操作有：

我的项目结构：

上代码。

对文件、目录的操作

package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import java.io.*;
import java.net.URI;

/**
 * Convenience wrappers around common HDFS file and directory operations.
 *
 * <p>Most operations catch any exception, print its stack trace and return a
 * human-readable status string instead of throwing. The remaining ones
 * ({@code ModifyFile}, {@code ReadBytes}, {@code ReadFile}) propagate
 * {@link IOException} to the caller.
 */
public class FileOperation {
    /**
     * Creates a Hadoop configuration with default settings.
     *
     * @return a new {@link Configuration}
     */
    public Configuration getConf() {
        return new Configuration();
    }

    /**
     * Obtains the file system that serves the given URI.
     *
     * @param path URI string of the file system, e.g. {@code hdfs://localhost:9000/}
     * @param conf Hadoop configuration to use
     * @return the file system handle
     * @throws IOException if the file system cannot be obtained
     */
    public FileSystem getHDFS(String path,Configuration conf) throws IOException {
        return FileSystem.get(URI.create(path),conf);
    }

    /**
     * 1. Uploads a local file to the cluster.
     *
     * @param src     local source path
     * @param desHDFS destination path on the file system
     * @param hdfs    target file system
     * @return a status message describing success or failure
     */
    public String Upload(String src,String desHDFS,FileSystem hdfs){
        try{
            hdfs.copyFromLocalFile(new Path(src),new Path(desHDFS));
            return "上传成功！";
        }catch (Exception e){
            e.printStackTrace();
            return "上传失败！";
        }
    }

    /**
     * 2. Downloads a file from the cluster to the local disk.
     *
     * @param srcHDFS source path on the file system
     * @param des     local destination path
     * @param hdfs    source file system
     * @return a status message describing success or failure
     */
    public String Download(String srcHDFS,String des,FileSystem hdfs){
        try{
            hdfs.copyToLocalFile(new Path(srcHDFS),new Path(des));
            return "下载成功！";
        }catch (Exception e){
            e.printStackTrace();
            return "下载失败！";
        }
    }

    /**
     * 3. Creates a file containing the given bytes.
     *
     * @param text content to write
     * @param des  path of the file to create
     * @param hdfs target file system
     * @return a status message describing success or failure
     */
    public String CreateFile(byte[] text,String des,FileSystem hdfs) {
        FSDataOutputStream out=null;
        try{
            out=hdfs.create(new Path(des));
            out.write(text,0,text.length);
            // Close explicitly so that a failure while flushing the data is
            // reported as a failure instead of being silently swallowed.
            out.close();
            out=null;
            return "创建文件成功！";
        }catch (Exception e){
            e.printStackTrace();
            return "创建文件失败！";
        }
        finally {
            // Only does work when the explicit close above was not reached.
            IOUtils.closeStream(out);
        }
    }

    // 4. Append to a file (src is the file appended to, inpath is the file whose content is appended).
    //    Kept disabled, as in the original notes.
//    public String Append(String src,String inpath,FileSystem hdfs,Configuration conf){
//        InputStream in=null;
//        OutputStream out=null;
//        try{
//            conf.set("dfs.support.append", "true");
//            conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
//            conf.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
//
//            in = new BufferedInputStream(hdfs.open(new Path(inpath)));
//            out = hdfs.append(new Path(src));
//            IOUtils.copyBytes(in, out, 4096, true);
//            return "追加文件成功！";
//        }catch (Exception e){
//            e.printStackTrace();
//            return "追加文件失败！";
//        }
//        finally {
//            IOUtils.closeStream(in);
//            IOUtils.closeStream(out);
//        }
//    }

    /**
     * Replaces the content of an existing file. Does nothing when the path
     * does not exist.
     *
     * @param text new content
     * @param path path of the file to rewrite
     * @param hdfs target file system
     * @throws IOException if deleting, creating, writing or closing fails
     */
    public void ModifyFile(byte[] text,String path,FileSystem hdfs) throws IOException {
        Path target=new Path(path);
        if(!hdfs.exists(target)){
            return;
        }
        // Explicit two-argument form of the deprecated delete(Path), which
        // delegates to delete(path, true).
        hdfs.delete(target,true);
        FSDataOutputStream out=null;
        try{
            out=hdfs.create(target);
            out.write(text,0,text.length);
            // Propagate close failures: the data may not have been persisted.
            out.close();
            out=null;
        }finally{
            // Releases the stream when write/close above threw.
            IOUtils.closeStream(out);
        }
    }

    /**
     * 5. Reads a whole file into memory.
     *
     * @param file path of the file to read
     * @param hdfs source file system
     * @return the file content
     * @throws IOException if opening or reading fails
     */
    public byte[] ReadBytes(String file,FileSystem hdfs) throws IOException {
        InputStream in=null;
        try{
            in=hdfs.open(new Path(file));
            return ToByteArray(in);
        }finally{
            // The stream was previously never closed.
            IOUtils.closeStream(in);
        }
    }

    /** Drains the stream into a byte array; the caller keeps ownership of the stream. */
    private byte[] ToByteArray(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024 * 4];//4K
        int n;
        while ((n = in.read(buffer)) != -1) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }

    /**
     * Copies a file's bytes to standard output.
     *
     * <p>The bytes are written verbatim; the original notes warn that Chinese
     * text comes out garbled when printed this way.
     *
     * @param file path of the file to print
     * @param hdfs source file system
     * @param conf configuration passed to the copy routine
     * @throws IOException if opening or copying fails
     */
    public void ReadFile(String file,FileSystem hdfs,Configuration conf) throws IOException {
        InputStream in=null;
        try{
            in=hdfs.open(new Path(file));
//        Line-based alternative:
//        BufferedReader bf=new BufferedReader(new InputStreamReader(in));
//        String line = null;
//        while ((line = bf.readLine()) != null) {
//            System.out.println(line);
//        }
            // close=false: with close=true copyBytes would also close System.out,
            // silencing every later print in the process.
            IOUtils.copyBytes(in,System.out,conf,false);
            System.out.flush();
        }finally{
            IOUtils.closeStream(in);
        }
    }

    /**
     * 6. Renames a file.
     *
     * @param src  current path
     * @param des  new path
     * @param hdfs target file system
     * @return a status message, or {@code "ERROR"} if an exception occurred
     */
    public String Rename(String src,String des,FileSystem hdfs){
        try{
            boolean renamed=hdfs.rename(new Path(src),new Path(des));
            return renamed ? "重命名文件成功！" : "重命名文件失败！";
        }catch (Exception e){
            e.printStackTrace();
            return "ERROR";
        }
    }

    /**
     * 7. Creates a directory.
     *
     * @param dirpath path of the directory to create
     * @param hdfs    target file system
     * @return a status message, or {@code "ERROR"} if an exception occurred
     */
    public String CreateDir(String dirpath,FileSystem hdfs){
        try{
            boolean created=hdfs.mkdirs(new Path(dirpath));
            return created ? "创建目录成功！" : "创建目录失败！";
        }catch (Exception e){
            e.printStackTrace();
            return "ERROR";
        }
    }

    /**
     * 8. Checks whether a file or directory exists.
     *
     * @param path path to check
     * @param hdfs target file system
     * @return a status message, or {@code "ERROR"} if an exception occurred
     */
    public String CheckExist(String path,FileSystem hdfs){
        try{
            boolean present=hdfs.exists(new Path(path));
            return present ? "文件/目录已存在！" : "文件/目录不存在！";
        }catch (Exception e){
            e.printStackTrace();
            return "ERROR";
        }
    }

    /**
     * 9. Deletes a file or directory; directories are removed recursively.
     *
     * @param path path to delete
     * @param hdfs target file system
     * @return a status message, or {@code "ERROR"} if an exception occurred
     */
    public String Delete(String path,FileSystem hdfs){
        try{
            // Explicit form of the deprecated delete(Path), which delegates
            // to delete(path, true).
            boolean deleted=hdfs.delete(new Path(path),true);
            return deleted ? "删除成功！" : "删除失败！";
        }catch (Exception e){
            e.printStackTrace();
            return "ERROR";
        }
    }

    /** Demo entry point: deletes {@code /data/mapfile1} on the local cluster. */
    public static void main(String[] args) throws IOException {
        FileOperation obj=new FileOperation();
        String hdfsRootPath="hdfs://localhost:9000/";
        String hdfsData="hdfs://localhost:9000/data";
        Configuration conf=obj.getConf();
        FileSystem hdfs=obj.getHDFS(hdfsRootPath,conf);

//        byte[] text1=obj.ReadBytes(hdfsData+"/testfile.txt",hdfs);
//        byte[] text2=obj.ReadBytes(hdfsData+"/testfile1.txt",hdfs);
//        // Concatenate the two byte arrays.
//        byte[] text=new byte[text1.length+text2.length];
//        // arraycopy(source, source offset, destination, destination offset, length)
//        System.arraycopy(text1,0,text,0,text1.length);
//        System.arraycopy(text2,0,text,text1.length,text2.length);
//        obj.ModifyFile(text,hdfsData+"/abc.txt",hdfs);

        obj.Delete(hdfsData+"/mapfile1",hdfs);
//        obj.Upload("C:/hadoop/data/FileOperation/bigFile.txt",hdfsData+"/bigFile.txt",hdfs);
//
//        obj.ReadFile(hdfsData+"/bigFile.txt",hdfs,conf);
    }
}

由于操作的是服务器上的文件系统，不能直观地看到变化，所以以上基本每一步都需要在命令提示符中使用“hadoop fs -ls -R /”（旧写法“hadoop fs -lsr /”已不推荐使用）查看更改。

查看信息

查看文件列表、文件状态、文件位置、节点信息

package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Demo that inspects a cluster: file listing, file status, block locations
 * and data node information. Only the data node section is enabled; the other
 * sections are kept as commented-out examples.
 */
public class ViewInfos {
    /**
     * Prints host name and block pool usage of every data node reported by
     * the file system at {@code hdfs://localhost:9000/}.
     *
     * @param args unused
     * @throws IOException        if the file system cannot be reached
     * @throws URISyntaxException declared for compatibility with earlier versions
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // Load the Hadoop configuration.
        Configuration conf=new Configuration();
        String hdfspath="hdfs://localhost:9000/";
        // Only referenced by the commented-out examples below.
        String hdfsData="hdfs://localhost:9000/data";
        // Create the file system handle.
        FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);

        /*--------------------1. List files on the cluster-------------------*/
//        FileStatus[] files=hdfs.listStatus(new Path(hdfsData));
//        System.out.println("hdfs Data目录下的文件为：");
//        // Print the entries under the data directory.
//        for(FileStatus file:files)
//            System.out.println(file.getPath());

        /*------------------2. Show file/directory status-----------------*/
//        FileStatus[] files=hdfs.listStatus(new Path(hdfsData));
//        for(FileStatus file:files)
//            System.out.println(file.getPath() + " " +file.getModificationTime());

        /*--------------------3. Show where a file's blocks live-------------------*/
//        String filePath=hdfsData+"/testfile1.txt";
//        FileStatus fileStatus=hdfs.getFileStatus(new Path(filePath));
//        // Fetch the file's blocks.
//        BlockLocation[] blockLocations=hdfs.getFileBlockLocations(fileStatus,0,fileStatus.getLen());
//        // Print each block together with the hosts that store it.
//        for(BlockLocation block : blockLocations){
//            String[] hosts=block.getHosts();
//            for(String host : hosts)
//                System.out.println("block:" + block + "; host:" + host);
//        }

        /*------------------------4. Show data node information-----------------------*/
        // Data node statistics are only offered by DistributedFileSystem, so
        // verify the type instead of risking a ClassCastException.
        if(!(hdfs instanceof DistributedFileSystem)){
            System.err.println("Not a DistributedFileSystem: "+hdfs.getClass().getName());
            return;
        }
        // Reuse the handle obtained above rather than looking it up a second time.
        DistributedFileSystem distributedHDFS=(DistributedFileSystem) hdfs;
        DatanodeInfo[] datanodeInfos=distributedHDFS.getDataNodeStats();
        for(DatanodeInfo datanode : datanodeInfos){
            System.out.println("host:"+datanode.getHostName());
            System.out.println("blockPoolUsed:"+datanode.getBlockPoolUsed());
        }
    }
}

文件压缩与解压缩

常见压缩格式有：

代码中使用的是gzip格式进行压缩与解压缩。

package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;

//文件压缩与解压缩
public class Compress {

    //使用GZip格式压缩
    public boolean CompressGZ(String src,String des,FileSystem hdfs,Configuration conf){
        FSDataInputStream in=null;
        FSDataOutputStream fsOut=null;
        CompressionOutputStream out=null;
        try {
            Class <?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
            CompressionCodec codec= (CompressionCodec) ReflectionUtils.newInstance(codecClass,conf);
            //读取文件
            in=hdfs.open(new Path(src));
            //创建输出文件
            fsOut=hdfs.create(new Path(des));
            out = codec.createOutputStream(fsOut);
            IOUtils.copyBytes(in, out, conf);
            return true;
        }catch (Exception e){
            e.printStackTrace();
            return false;
        }finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(fsOut);
            IOUtils.closeStream(out);
        }
    }

    //使用GZip格式解压，并输出到控制台
    public boolean UnCompressGZ(String src,FileSystem hdfs,Configuration conf){
        FSDataInputStream in=null;
        InputStream inputStream=null;
        try {
            Class <?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
            CompressionCodec codec= (CompressionCodec) ReflectionUtils.newInstance(codecClass,conf);
            //读取文件
            in=hdfs.open(new Path(src));
            inputStream=codec.createInputStream(in);
            IOUtils.copyBytes(inputStream, System.out, conf);
            return true;
        }catch (Exception e){
            e.printStackTrace();
            return false;
        }finally {
            IOUtils.closeStream(inputStream);
            IOUtils.closeStream(in);
        }
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException {
        Configuration conf=new Configuration();
        String hdfspath="hdfs://localhost:9000/";
        String hdfsData="hdfs://localhost:9000/data";
        FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);
        //指定压缩文件的来源路径及输出路径
        String src=hdfsData+"/bigFile.txt";
        String des=hdfsData+"/bigFile.txt.gz";

        Compress obj=new Compress();
//        obj.CompressGZ(src,des,hdfs,conf);
        obj.UnCompressGZ(des,hdfs,conf);
    }
}

序列化

Sequence File

package hdfs.serialization;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.net.URI;

import static org.apache.hadoop.io.SequenceFile.createWriter;

/**
 * SequenceFile demo.
 *
 * <p>From the original notes: a SequenceFile addresses small-file storage by
 * merging many small files into one large file stored as key-value pairs; it
 * is splittable and its values can be compressed, so it takes less space.
 */
public class SequenceFileDemo {

    /**
     * Prints every record of a SequenceFile as
     * {@code <position> <sync marker> <key> <value>}, where the sync marker is
     * {@code *} when a sync point was passed while reading the record.
     *
     * @param src  path of the SequenceFile
     * @param hdfs file system holding the file
     * @param conf Hadoop configuration
     * @throws IOException if opening or reading fails
     */
    public void ReadSequenceFile(String src,FileSystem hdfs,Configuration conf) throws IOException {
        SequenceFile.Reader reader=null;
        try{
            // The FileSystem-based constructor is kept so that the caller's
            // hdfs handle is the one actually used.
            reader=new SequenceFile.Reader(hdfs,new Path(src),conf);
            Writable key=(Writable) ReflectionUtils.newInstance(reader.getKeyClass(),conf);
            Writable value=(Writable) ReflectionUtils.newInstance(reader.getValueClass(),conf);
            // Position is sampled before each read, i.e. where the record starts.
            long position=reader.getPosition();
            while (reader.next(key,value)){
                String syncMarker=reader.syncSeen() ? "*":"";
                System.out.println(position+" "+syncMarker+" "+key+" "+value);
                position=reader.getPosition();
            }
        }finally{
            // Also reached when reading throws, so the reader no longer leaks.
            IOUtils.closeStream(reader);
        }
    }

    /**
     * Writes the given rows to a new SequenceFile with keys 1..n
     * ({@code IntWritable}) and the rows as {@code Text} values, echoing each
     * record to standard output as it is appended.
     *
     * @param data rows to store
     * @param src  path of the SequenceFile to create
     * @param hdfs file system to write to
     * @param conf Hadoop configuration
     * @throws IOException if creating, writing or closing fails
     */
    public void WriteSequenceFile(String[] data,String src,FileSystem hdfs,Configuration conf) throws IOException {
        IntWritable key=new IntWritable();
        Text value=new Text();
        SequenceFile.Writer writer=null;
        try{
            writer=SequenceFile.createWriter(hdfs,conf,new Path(src),key.getClass(),value.getClass());
            int i=0;
            for(String row : data)
            {
                i++;
                key.set(i);
                value.set(row);
                System.out.println(writer.getLength()+" "+key+" "+value);
                writer.append(key,value);
            }
            // Close explicitly so a failed flush surfaces as an IOException.
            writer.close();
            writer=null;
        }finally{
            // Only does work when an exception prevented the close above.
            IOUtils.closeStream(writer);
        }
    }

    /** Demo entry point: reads {@code /data/sequence.seq} from the local cluster. */
    public static void main(String[] args) throws IOException {
        Configuration conf=new Configuration();
        String hdfspath="hdfs://localhost:9000/";
        FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);

        String data[]={"One, two, Buckle my shoe","Three, four, Shut the front door","Five, six, Pick up sticks",
                        "Seven, eight, Lay them straight","Nine, ten, A big fat hen"};
        String src=hdfspath+"data/sequence.seq";

        SequenceFileDemo obj=new SequenceFileDemo();
//        obj.WriteSequenceFile(data,src,hdfs,conf);
        obj.ReadSequenceFile(src,hdfs,conf);
    }
}

MapFile

package hdfs.serialization;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import java.io.IOException;
import java.net.URI;

/**
 * MapFile demo.
 *
 * <p>From the original notes: a MapFile is compatible with and similar to a
 * SequenceFile, but is stored as a directory holding an index plus the
 * key/value data.
 */
public class MapFileDemo {

    /**
     * Writes the given rows to a new MapFile with keys 1..n
     * ({@code IntWritable}) and the rows as {@code Text} values, echoing each
     * entry to standard output.
     *
     * @param data rows to store
     * @param src  directory name of the MapFile to create
     * @param hdfs file system to write to
     * @param conf Hadoop configuration
     * @throws IOException if creating, writing or closing fails
     */
    public void WriteMapFile(String[] data,String src,FileSystem hdfs,Configuration conf) throws IOException {
        IntWritable key=new IntWritable();
        Text value=new Text();
        MapFile.Writer writer=null;
        try{
            writer=new MapFile.Writer(conf,hdfs,src,key.getClass(),value.getClass());
            // Index interval of 2 (original note: one index entry per two rows).
            writer.setIndexInterval(2);
            int i=0;
            for(String row: data){
                i++;
                key.set(i);
                value.set(row);
                System.out.println(i+" "+row);
                writer.append(key,value);
            }
            // Close explicitly so a failed flush surfaces as an IOException.
            writer.close();
            writer=null;
        }finally{
            // Only does work when an exception prevented the close above.
            IOUtils.closeStream(writer);
        }
    }

    /**
     * Prints every entry of a MapFile as {@code <key> <value>}.
     *
     * @param src  directory name of the MapFile
     * @param hdfs file system holding the MapFile
     * @param conf Hadoop configuration
     * @throws IOException if opening or reading fails
     */
    public void ReadMapFile(String src,FileSystem hdfs,Configuration conf) throws IOException {
        MapFile.Reader reader=null;
        try{
            reader=new MapFile.Reader(hdfs,src,conf);
            // next() fills in the key it read; print that key rather than a
            // local counter, which is only right when keys happen to be 1..n.
            IntWritable key=new IntWritable();
            Text value=new Text();
            while (reader.next(key,value)){
                System.out.println(key +" "+value);
            }
        }finally{
            // Also reached when reading throws, so the reader no longer leaks.
            IOUtils.closeStream(reader);
        }
    }

    /** Demo entry point: writes the sample rows to {@code /data/mapfile} on the local cluster. */
    public static void main(String[] args) throws IOException {
        Configuration conf=new Configuration();
        String hdfspath="hdfs://localhost:9000/";
        FileSystem hdfs=FileSystem.get(URI.create(hdfspath),conf);

        String data[]={"java,scala,python","hadoop,hive,hbase",
                        "spark,kafka,strom,sqoop", "spark sql,spark steaming,MLBase,MLlib"};
        String src=hdfspath+"data/mapfile";

        MapFileDemo obj=new MapFileDemo();
        obj.WriteMapFile(data,src,hdfs,conf);
//        obj.ReadMapFile(src,hdfs,conf);
    }
}

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
hadoop学习笔记（二）hadoop hdfs开发

目录对文件、目录的操作查看信息查看文件列表、文件状态、文件位置、节点信息文件压缩与解压缩序列化Sequence FileMapFile首先，必须运行hadoop，windows中在hadoop的路径下，sbin目录，start-all.cmd，会跳出四个命令行窗口，不要管它，缩小即可。这个不开启的话，项目无法运行，会报错。还有不要刚开完就运行项目，会进入安全模...
复制链接

扫一扫

专栏目录

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。