hadoop API之:文件操作

hadoop API之:文件操作

@(HADOOP)[hadoop, hadoop2]

Hadoop提供了大量的API对文件系统中的文件进行操作,主要包括:

(1)读取文件

(2)写文件

(3)读取文件属性

(4)列出文件

(5)删除文件

完整代码见:https://github.com/lujinhong/lujinhong-commons/tree/master/lujinhong-commons-hadoop/src/main/java/com/lujinhong/commons/hadoop/fs

1、读取文件

以下示例中,将hdfs中的一个文件读取出来,并输出到标准输出流中;随后通过seek(0)回到文件开头,再完整输出一次(因此内容会被打印两遍)。

package org.jediael.hadoopdemo.fsdemo;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Prints the content of a file to standard output twice, demonstrating that an
 * {@link FSDataInputStream} can be repositioned with {@code seek()}.
 */
public class FileSystemDoubleCat {

    /**
     * Opens the file named by {@code args[0]}, copies it to {@code System.out},
     * rewinds to offset 0 and copies it a second time.
     *
     * @param args {@code args[0]} is the URI of the file to print
     * @throws IOException if the file cannot be opened, read or repositioned
     */
    public static void main(String[] args) throws IOException {

        String fileName = args[0];
        Configuration conf = new Configuration();

        FileSystem fs = FileSystem.get(URI.create(fileName), conf);
        // try-with-resources only closes the stream when open() succeeded, so a
        // failing open() reports its own IOException instead of being masked by a
        // NullPointerException from close() on a null reference.
        try (FSDataInputStream in = fs.open(new Path(fileName))) {
            // close=false: the stream has to stay open for the second pass.
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Rewind to the beginning of the file before copying again.
            in.seek(0);
            IOUtils.copyBytes(in, System.out, 4096, false);
        }

    }

}

(1)其中FSDataInputStream实现了Seekable接口,可以对文件进行随机定位,但注意,seek()的代价较高,如无必要,尽量少使用。

2、文件复制

package org.jediael.hadoopdemo.fsdemo;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Copies a local file to a destination addressed by a Hadoop filesystem URI.
 */
public class FileCopy {

    /**
     * Copies the local file {@code args[0]} to the destination {@code args[1]}.
     *
     * @param args {@code args[0]} is the local source file,
     *             {@code args[1]} is the destination URI
     * @throws IOException if the source cannot be read or the destination
     *                     cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        String sourceFile = args[0];
        String destFile = args[1];

        // Each stream is managed by try-with-resources: a stream that was never
        // opened is never closed (no NullPointerException hiding the real error),
        // and a failure while closing one stream cannot leak the other.
        // 1. Prepare the input stream.
        try (InputStream in = new BufferedInputStream(new FileInputStream(sourceFile))) {
            // 2. Prepare the output stream.
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(destFile), conf);
            try (OutputStream out = fs.create(new Path(destFile))) {
                // 3. Copy; close=false because both streams are closed by the
                //    enclosing try-with-resources statements.
                IOUtils.copyBytes(in, out, 4096, false);
            }
        }

    }

}

3、获取文件属性

文件属性以FileStatus对象进行封装,使用FileSystem对象的getFileStatus()方法,可以获取到文件的FileStatus对象。

package org.jediael.hadoopdemo.fsdemo;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Prints the owner and the modification time of a single file.
 */
public class FileStatusDemo {

    /**
     * Looks up the {@link FileStatus} of the file named by {@code args[0]} and
     * prints its owner and modification time, separated by one space.
     *
     * @param args {@code args[0]} is the URI of the file to inspect
     * @throws IOException if the file system or the file status cannot be obtained
     */
    public static void main(String[] args) throws IOException {
        final String target = args[0];

        // Obtain the FileSystem that serves the given URI.
        final Configuration configuration = new Configuration();
        final FileSystem fileSystem = FileSystem.get(URI.create(target), configuration);

        // Fetch the FileStatus object that wraps the file's attributes.
        final FileStatus fileStatus = fileSystem.getFileStatus(new Path(target));

        final String owner = fileStatus.getOwner();
        final long modifiedAt = fileStatus.getModificationTime();
        System.out.println(owner + " " + modifiedAt);
    }

}

4、列出某个目录下的文件

使用FileSystem的listStatus()方法,可以获取到某个目录下所有文件的FileStatus对象。

package org.jediael.hadoopdemo.fsdemo;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

/**
 * Lists the entries of a directory, one path per output line.
 */
public class ListStatusDemo {

    /**
     * Prints the path of every entry returned by {@code listStatus()} for the
     * directory named by {@code args[0]}.
     *
     * @param args {@code args[0]} is the URI of the directory to list
     * @throws IOException if the file system cannot be obtained or the listing fails
     */
    public static void main(String[] args) throws IOException {
        final String directory = args[0];

        final Configuration configuration = new Configuration();
        final FileSystem fileSystem = FileSystem.get(URI.create(directory), configuration);

        // Convert the FileStatus array into the corresponding Path array.
        final Path[] entries = FileUtil.stat2Paths(fileSystem.listStatus(new Path(directory)));

        for (int i = 0; i < entries.length; i++) {
            System.out.println(entries[i]);
        }
    }

}

递归列出目录下的所有文件(2.0版本以后适用):

/**
 * Recursively lists all files in a directory.
 *
 * @param dir URI of the directory to scan
 * @return the path of every file reported by {@code listFiles(path, true)},
 *         rendered with {@code toString()}, in iteration order
 * @throws IOException if the file system cannot be obtained or the listing fails
 */
public static List<String> getAllHdfsFile(String dir) throws IOException {
    List<String> collected = new ArrayList<>();
    Configuration configuration = new Configuration();
    FileSystem fileSystem = FileSystem.get(URI.create(dir), configuration);

    // The second argument (true) asks listFiles() to recurse into sub-directories.
    for (RemoteIterator<LocatedFileStatus> remaining = fileSystem.listFiles(new Path(dir), true);
            remaining.hasNext(); ) {
        LocatedFileStatus entry = remaining.next();
        collected.add(entry.getPath().toString());
    }

    return collected;
}

5、读取SequenceFile

package com.lujinhong.commons.hadoop.fs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.IOException;
import java.net.URI;

/**
 * AUTHOR: LUJINHONG
 * CREATED ON: 17/1/11 11:32
 * PROJECT NAME: lujinhong-commons
 * DESCRIPTION: Demonstrates how to read a SequenceFile compressed with snappy.
 * No compression format is specified in this code, yet (per the original
 * author's note) the file was decompressed successfully.
 */
public class SequenceSnappyFileReader {

    /**
     * Reads every record of the SequenceFile named by {@code args[0]} and prints
     * each record's value on its own line.
     *
     * @param args {@code args[0]} is the path of the SequenceFile to read
     * @throws IOException if the file cannot be opened or a record cannot be read
     */
    public static void main(String[] args) throws IOException {
        final String uri = args[0];
        final Configuration conf = new Configuration();
        final Path path = new Path(uri);

        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));

            // Key and value holders are created reflectively from the classes the
            // reader reports, and are reused for every record.
            final Writable recordKey =
                    (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            final Writable recordValue =
                    (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            // Note: reader.getPosition() and reader.syncSeen() could be used here to
            // additionally print each record's offset and mark sync points.
            for (boolean hasRecord = reader.next(recordKey, recordValue);
                    hasRecord;
                    hasRecord = reader.next(recordKey, recordValue)) {
                System.out.println( recordValue);
            }
        } finally {
            // closeStream() tolerates a null reader.
            IOUtils.closeStream(reader);
        }
    }
}

6、读取HDFS文件

/**
 * Reads the file named by {@code args[0]} line by line, decoding it as UTF-8.
 *
 * <p>The loop body is intentionally left empty: it marks the place where each
 * line would be processed.
 *
 * @param args {@code args[0]} is the URI of the file to read
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    String fileName = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(fileName), conf);

    // Both the raw stream and the reader are declared as resources so they are
    // always closed, including when constructing the reader or reading a line
    // fails; previously neither was ever closed.
    try (FSDataInputStream hdfsInStream = fs.open(new Path(fileName));
         BufferedReader in = new BufferedReader(new InputStreamReader(hdfsInStream, "UTF-8"))) {
        String line;
        while ((line = in.readLine()) != null) {
            // Process the current line here.
        }
    }
}
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值