HDFS API
1. Maven
Look up dependency coordinates at https://mvnrepository.com/, then create a Maven project in IDEA and configure its pom.xml as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.csz.bigdata</groupId>
  <artifactId>csz-hadoop</artifactId>
  <version>1.0</version>

  <name>csz-hadoop</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
  </properties>

  <repositories>
    <!-- Aliyun mirror -->
    <repository>
      <id>aliyun</id>
      <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </repository>
    <!-- Cloudera (CDH) repository -->
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
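With the Cloudera repository declared, the CDH build of hadoop-client (2.6.0-cdh5.16.2) should resolve on the next build. As a quick sanity check (a suggested step, not part of the original post), compile once and inspect the dependency tree:

mvn clean compile
mvn dependency:tree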
2. HDFS API development in IDEA
2.1 mkdir / copyFromLocalFile
Create the following JUnit test class in IDEA:
package com.csz.bigdata.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.net.URI;

public class HDFSAPITest02 {

    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception {
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://hadoop001:9000");
        // connect to HDFS as user "hadoop"
        fileSystem = FileSystem.get(uri, conf, "hadoop");
    }

    @After
    public void tearDown() throws Exception {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }

    /**
     * Create the /hdfsapi directory on HDFS
     */
    @Test
    public void mkdir() throws Exception {
        Path path = new Path("/hdfsapi");
        fileSystem.mkdirs(path);
    }

    /**
     * Copy a local file to HDFS
     */
    @Test
    public void copyFromLocalFile() throws Exception {
        Path src = new Path("data/cszdata.txt");
        Path dst = new Path("/hdfsapi3/cszdata");
        fileSystem.copyFromLocalFile(src, dst);
    }
}
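The result of mkdir and copyFromLocalFile can be verified from the same test class. The following is a supplementary sketch, not part of the original post; it reuses the fileSystem field initialized in @Before and needs an extra import of org.apache.hadoop.fs.FileStatus:

    /**
     * List what exists under /hdfsapi to check the directory was created
     * (supplementary sketch, reusing the fileSystem from setUp above).
     */
    @Test
    public void listStatus() throws Exception {
        FileStatus[] statuses = fileSystem.listStatus(new Path("/hdfsapi"));
        for (FileStatus status : statuses) {
            // print a directory/file flag followed by the full HDFS path
            System.out.println((status.isDirectory() ? "d" : "-") + "\t" + status.getPath());
        }
    }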
2.2 Uploading and downloading files with IO streams
package com.csz.bigdata.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URI;

public class HDFSAPITest03 {

    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception {
        Configuration conf = new Configuration();
        // address DataNodes by hostname instead of internal IP (needed on cloud hosts, see 3.1)
        conf.set("dfs.client.use.datanode.hostname", "true");
        // override the default replication factor of 3 from hdfs-default.xml
        conf.set("dfs.replication", "1");
        URI uri = new URI("hdfs://hadoop001:9000");
        fileSystem = FileSystem.get(uri, conf, "hadoop");
    }

    /**
     * Upload a local file to HDFS using IO streams
     * @throws Exception
     */
    @Test
    public void copyFromLocalFile() throws Exception {
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("data/cszdata.txt")));
        FSDataOutputStream out = fileSystem.create(new Path("/hdfsapi/pk/io"));
        // in ==> out
        IOUtils.copyBytes(in, out, 4096);
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    /**
     * Download a file from HDFS using IO streams
     * @throws Exception
     */
    @Test
    public void copyToLocalFile() throws Exception {
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/pk/io"));
        FileOutputStream out = new FileOutputStream(new File("out/io-out.txt"));
        IOUtils.copyBytes(in, out, fileSystem.getConf());
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    /**
     * Read block 0 (the first 128 MB) of a multi-block file
     */
    @Test
    public void downLoad() throws Exception {
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/spark-2.4.6-bin-hadoop2.6.tgz"));
        FileOutputStream out = new FileOutputStream(new File("out/spark.tgz.part0"));
        // read only the first block, bytes 0 - 128 MB, 2048 bytes at a time
        byte[] buffer = new byte[2048];
        for (int i = 0; i < 1024 * 1024 * 128 / 2048; i++) {
            int len = in.read(buffer);
            if (len == -1) {
                break;
            }
            out.write(buffer, 0, len);
        }
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    /**
     * Read block 1: skip the first 128 MB, then copy the rest
     */
    @Test
    public void downLoad1() throws Exception {
        FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/spark-2.4.6-bin-hadoop2.6.tgz"));
        FileOutputStream out = new FileOutputStream(new File("out/spark.tgz.part1"));
        // jump to the start of the second block
        in.seek(1024 * 1024 * 128);
        IOUtils.copyBytes(in, out, fileSystem.getConf());
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
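Note that, unlike HDFSAPITest02, this class never closes the FileSystem. The same tearDown used in 2.1 can be added here as well (the org.junit.After import is already present):

    @After
    public void tearDown() throws Exception {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }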
3. Troubleshooting
3.1 Handling replication errors
- If a replication-related error is thrown when writing files, it is a cloud-host issue: the client gets back the DataNodes' internal IPs and cannot reach them, so DataNodes have to be addressed by hostname instead. Add the following to hdfs-site.xml on the Hadoop server:
<property>
    <name>dfs.datanode.use.datanode.hostname</name>
    <value>true</value>
</property>
Then add the following in the client's setUp() (this is exactly what HDFSAPITest03 above already does):
conf.set("dfs.client.use.datanode.hostname", "true");
- Why do newly created files end up with 3 replicas?
Because the client in IDEA reads hdfs-default.xml from the Hadoop jar pulled in by Maven, and its default replication setting is 3:
<property>
    <name>dfs.replication</name>
    <value>3</value>
    <description>Default block replication.
    The actual number of replications can be specified when the file is created.
    The default is used if replication is not specified in create time.
    </description>
</property>
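To override this on the client side, set the property on the Configuration before obtaining the FileSystem, as the setUp() of HDFSAPITest03 above already does:

conf.set("dfs.replication", "1");

Files written through that client then get a single replica instead of the default 3.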
3.2 Hadoop is not installed on the Windows client
Install Hadoop on the Windows machine and set up the environment:
1) Download winutils.exe and hadoop.dll matching your Hadoop version, set HADOOP_HOME, and add it to PATH: https://github.com/steveloughran/winutils/tree/master/hadoop-2.7.1/bin
2) Put both files under %HADOOP_HOME%\bin, and also copy hadoop.dll into C:\Windows\System32.
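If editing the system PATH is inconvenient, Hadoop also honors the hadoop.home.dir JVM system property when locating winutils.exe. A hedged alternative for the test classes above, assuming winutils.exe was unpacked under C:\hadoop\bin (an example path, adjust to the real location):

// set before any Configuration/FileSystem call; C:\\hadoop is an assumed example path
System.setProperty("hadoop.home.dir", "C:\\hadoop");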