HDFS API

1. Maven

Dependency coordinates can be looked up at https://mvnrepository.com/.

Configure the project's pom.xml in IDEA as follows:

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.csz.bigdata</groupId>
  <artifactId>csz-hadoop</artifactId>
  <version>1.0</version>

  <name>csz-hadoop</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
  </properties>



  <repositories>
    <!-- Aliyun mirror -->
    <repository>
      <id>aliyun</id>
      <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </repository>

    <!-- Cloudera CDH repository -->
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>


  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>



  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

2. Developing with the HDFS API in IDEA

2.1 mkdir

Create a JUnit test class in IDEA:

package com.csz.bigdata.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.net.URI;

public class HDFSAPITest02 {

    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception{
        Configuration conf = new Configuration();

        // connect to the NameNode at hadoop001:9000 as user "hadoop"
        URI uri = new URI("hdfs://hadoop001:9000");
        fileSystem = FileSystem.get(uri, conf, "hadoop");
    }

    @After
    public void tearDown() throws Exception{
        if (null != fileSystem) {
            fileSystem.close();
        }
    }

    /**
     * Create the /hdfsapi directory on HDFS.
     */
    @Test
    public void mkdir() throws Exception{
        Path path = new Path("/hdfsapi");
        fileSystem.mkdirs(path);
    }
    /**
     * Copy a local file to HDFS.
     */
    @Test
    public void copyFromLocalFile() throws Exception{

        Path src = new Path("data/cszdata.txt");
        Path dst = new Path("/hdfsapi3/cszdata");
        fileSystem.copyFromLocalFile(src, dst);
    }
}
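
After running the two tests you can check the result from the same client. A minimal sketch of such a check, written as one more @Test method for the HDFSAPITest02 class above (FileSystem.exists is a standard Hadoop API; the path is the one created by mkdir()):

    /**
     * Verify that the /hdfsapi directory created by mkdir() is present.
     */
    @Test
    public void exists() throws Exception {
        // prints true once mkdir() has run successfully
        System.out.println(fileSystem.exists(new Path("/hdfsapi")));
    }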

2.2 File transfer with IO streams

package com.csz.bigdata.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;


import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URI;

public class HDFSAPITest03 {

    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception{
        Configuration conf = new Configuration();
        // resolve DataNodes by hostname instead of their private IPs (needed on cloud hosts, see section 3.1)
        conf.set("dfs.client.use.datanode.hostname","true");
        // write a single replica so a one-DataNode cluster can satisfy writes
        conf.set("dfs.replication","1");
        URI uri = new URI("hdfs://hadoop001:9000");
        fileSystem = FileSystem.get(uri, conf, "hadoop");
    }

    /**
     * Upload a local file to HDFS using IO streams.
     * @throws Exception
     */
    @Test
    public void copyFromLocalFile() throws Exception{
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("data/cszdata.txt")));

        FSDataOutputStream out = fileSystem.create(new Path("/hdfsapi/pk/io"));

        // in ==> out
        IOUtils.copyBytes(in, out, 4096);

        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    /**
     * Download a file from HDFS to the local filesystem using IO streams.
     * @throws Exception
     */
    @Test
    public void copyToLocalFile() throws Exception{
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/pk/io"));
        FileOutputStream out = new FileOutputStream(new File("out/io-out.txt"));
        IOUtils.copyBytes(in, out, fileSystem.getConf());

        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }


    /**
     * Download only block 0 (the first 128 MB) of a large file.
     */
    @Test
    public void downLoad() throws  Exception{
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/spark-2.4.6-bin-hadoop2.6.tgz"));
        FileOutputStream out = new FileOutputStream(new File("out/spark.tgz.part0"));

        // read the first block only: bytes 0 .. 128 MB (the default block size here)
        byte[] buffer = new byte[2048];
        long blockSize = 1024L * 1024 * 128;
        long copied = 0;
        while (copied < blockSize) {
            int len = in.read(buffer, 0, (int) Math.min(buffer.length, blockSize - copied));
            if (len == -1) {
                break;              // file is shorter than one block
            }
            out.write(buffer, 0, len);
            copied += len;
        }

        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
    /**
     * Download block 1 (everything after the first 128 MB); concatenating
     * part0 and part1 reproduces the original file.
     */
    @Test
    public void downLoad1() throws  Exception{
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/spark-2.4.6-bin-hadoop2.6.tgz"));
        FileOutputStream out = new FileOutputStream(new File("out/spark.tgz.part1"));

        in.seek(1024*1024*128);   // skip block 0, then copy the rest of the file
        IOUtils.copyBytes(in, out, fileSystem.getConf());

        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

3. Troubleshooting

3.1 Replication errors
  1. If a replication error is reported
    The replication problem comes from running on cloud hosts: the client cannot reach the DataNodes by their internal addresses.
    Add the following to hdfs-site.xml on the Hadoop server:
<property>
	<name>dfs.datanode.use.datanode.hostname</name>
	<value>true</value>
</property>

Then add the following in the client's setUp():

conf.set("dfs.client.use.datanode.hostname", "true");
  2. Why do files created from the client get 3 replicas?
    Because the client reads hdfs-default.xml from the Hadoop jars on the IDEA/Maven classpath, where replication defaults to 3:
<property>
  <name>dfs.replication</name>
  <value>3</value>
  <description>Default block replication. 
  The actual number of replications can be specified when the file is created.
  The default is used if replication is not specified in create time.
  </description>
</property>
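
Besides calling conf.set("dfs.replication", "1") in setUp() as shown above, the client also picks up an hdfs-site.xml from its classpath, which overrides the bundled hdfs-default.xml. A minimal sketch, assuming a single-DataNode cluster (hence the value 1) and that the file is placed under src/main/resources:

<?xml version="1.0" encoding="UTF-8"?>
<!-- src/main/resources/hdfs-site.xml: overrides hdfs-default.xml from the Hadoop client jar -->
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>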

3.2 Hadoop is not installed on the Windows client

Install Hadoop on Windows and set up the environment:

1) Download winutils.exe and hadoop.dll matching your Hadoop version (https://github.com/steveloughran/winutils/tree/master/hadoop-2.7.1/bin), set HADOOP_HOME, and add %HADOOP_HOME%\bin to PATH.
2) Put both files under %HADOOP_HOME%\bin, and also copy hadoop.dll to C:\Windows\System32.
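
As an alternative to setting the HADOOP_HOME environment variable, the Hadoop client also honours the hadoop.home.dir JVM property when it looks for winutils.exe. A minimal sketch, assuming the files were unpacked to C:\hadoop\bin (the path is an assumption; adjust it to your install location):

    // Run before the first FileSystem call, e.g. at the top of setUp().
    // C:\hadoop is an assumed install directory containing bin\winutils.exe and bin\hadoop.dll.
    System.setProperty("hadoop.home.dir", "C:\\hadoop");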
