In the previous post, we set up a Hadoop cluster environment (two nodes).
Hadoop is a cluster by nature; even a single node is simply a single-node cluster. Under the hood, Hadoop uses HDFS as its default file system, MapReduce is the computation model (framework) that runs on top of HDFS, and YARN is the resource-scheduling framework that was split out of the MapReduce framework in Hadoop 2.x. We will cover YARN in detail in a later post.
Ahem, back to the main topic.
HDFS is Hadoop's built-in (default) distributed file system. It can store massive amounts of data and provides load balancing and replica-based fault tolerance (which is excellent). HDFS exposes a web (RESTful) API, a Java API, and a command-line API for operations such as uploading and downloading files.
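The web (RESTful) API is served by WebHDFS on the NameNode. This post does not use it further, but here is a minimal sketch (not from the original post) that lists the root directory over HTTP. It assumes WebHDFS is enabled (the default in Hadoop 2.x), the default NameNode HTTP port 50070, and the hadoopNode01 host and hadoop user from the cluster we set up earlier; adjust these to your environment.

WebHdfsListDemo.java

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class WebHdfsListDemo {
    public static void main(String[] args) throws Exception {
        // List the root directory "/" via the WebHDFS REST API.
        // Assumptions: NameNode HTTP port 50070 (Hadoop 2.x default), user "hadoop".
        URL url = new URL("http://hadoopNode01:50070/webhdfs/v1/?op=LISTSTATUS&user.name=hadoop");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // JSON describing the entries under "/"
            }
        } finally {
            conn.disconnect();
        }
    }
}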
HDFS command-line API (these are really just Hadoop's Linux shell commands):
hadoop fs    # pressing Enter prints the command help
[hadoop@hadoopNode01 ~]$ hadoop fs
Usage: hadoop fs [generic options]
[-appendToFile <localsrc> ... <dst>]
[-cat [-ignoreCrc] <src> ...]
[-checksum <src> ...]
[-chgrp [-R] GROUP PATH...]
[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
[-chown [-R] [OWNER][:[GROUP]] PATH...]
[-copyFromLocal [-f] [-p] [-l] <localsrc> ... <dst>]
[-copyToLocal [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-count [-q] [-h] <path> ...]
[-cp [-f] [-p | -p[topax]] <src> ... <dst>]
[-createSnapshot <snapshotDir> [<snapshotName>]]
[-deleteSnapshot <snapshotDir> <snapshotName>]
[-df [-h] [<path> ...]]
[-du [-s] [-h] <path> ...]
[-expunge]
[-find <path> ... <expression> ...]
[-get [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-getfacl [-R] <path>]
[-getfattr [-R] {-n name | -d} [-e en] <path>]
[-getmerge [-nl] <src> <localdst>]
[-help [cmd ...]]
[-ls [-d] [-h] [-R] [<path> ...]]
[-mkdir [-p] <path> ...]
[-moveFromLocal <localsrc> ... <dst>]
[-moveToLocal <src> <localdst>]
[-mv <src> ... <dst>]
[-put [-f] [-p] [-l] <localsrc> ... <dst>]
[-renameSnapshot <snapshotDir> <oldName> <newName>]
[-rm [-f] [-r|-R] [-skipTrash] <src> ...]
[-rmdir [--ignore-fail-on-non-empty] <dir> ...]
[-setfacl [-R] [{-b|-k} {-m|-x <acl_spec>} <path>]|[--set <acl_spec> <path>]]
[-setfattr {-n name [-v value] | -x name} <path>]
[-setrep [-R] [-w] <rep> <path> ...]
[-stat [format] <path> ...]
[-tail [-f] <file>]
[-test -[defsz] <path>]
[-text [-ignoreCrc] <src> ...]
[-touchz <path> ...]
[-truncate [-w] <length> <path> ...]
[-usage [cmd ...]]
Generic options supported are
-conf <configuration file> specify an application configuration file
-D <property=value> use value for given property
-fs <local|namenode:port> specify a namenode
-jt <local|resourcemanager:port> specify a ResourceManager
-files <comma separated list of files> specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars> specify comma separated jar files to include in the classpath.
-archives <comma separated list of archives> specify comma separated archives to be unarchived on the compute machines.
The general command line syntax is
bin/hadoop command [genericOptions] [commandOptions]
The commands I use most often:
hadoop fs -put local remote      upload a local file to HDFS
hadoop fs -get remote local      download a file from HDFS to the local disk
hadoop fs -ls path               list the files (directories) under an HDFS directory
hadoop fs -cat remote            print a file stored on HDFS
hadoop fs -mkdir [-p] path       create a directory on HDFS (-p creates parent directories recursively)
hadoop fs -rm -f -r path         delete a directory on HDFS
hadoop fs -rm -f remote          delete a file on HDFS
hadoop fs -mv src dest           move a file on HDFS (both src and dest are on HDFS)
hadoop fs -cp src dest           copy a file on HDFS (both src and dest are on HDFS)
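Incidentally, the same fs subcommands can also be driven from Java through the FsShell class, which the hadoop fs command uses internally. The following is just a sketch (not from the original post, the class name FsShellDemo is mine) and assumes the same fs.defaultFS as our cluster:

FsShellDemo.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.util.ToolRunner;

public class FsShellDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        // Equivalent of running "hadoop fs -ls /" on the command line.
        int exitCode = ToolRunner.run(conf, new FsShell(conf), new String[]{"-ls", "/"});
        System.exit(exitCode);
    }
}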
HDFS Java client API sample code:
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.tingcream</groupId>
  <artifactId>hadoopStudy</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>hadoopStudy</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <!-- hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.6</version>
      <scope>provided</scope>
    </dependency>
    <!-- hadoop-hdfs -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.6</version>
    </dependency>
    <!-- hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.6</version>
    </dependency>
    <!-- hadoop-mapreduce-client-jobclient -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.7.6</version>
      <scope>provided</scope>
    </dependency>
    <!-- hadoop-mapreduce-client-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.6</version>
    </dependency>
  </dependencies>
</project>
HdfsDemo.java
package com.tingcream.hadoopStudy.hdfs;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;

public class HdfsDemo {

    /*
     * FileSystem is an abstract class. Its concrete subclasses include
     * DistributedFileSystem (HDFS), FTPFileSystem (FTP) and RawLocalFileSystem
     * (the local file system, e.g. ext3/ext4 on Linux, NTFS on Windows).
     */
    private FileSystem fs = null;

    @Before
    public void init() throws Exception {
        // Reads the xxx-site.xml files on the classpath (core-site.xml, hdfs-site.xml, ...),
        // parses them and wraps the settings in a conf object.
        Configuration conf = new Configuration();
        /*
         * Note: configure the hosts file on your local (Windows) machine so that
         * hadoopNode01 and hadoopNode02 resolve to the right IPs:
         *   192.168.9.11 hadoopNode01
         *   192.168.9.12 hadoopNode02
         */
        // Settings can also be set programmatically; they override the values read from the config files.
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        // Obtain a client instance for the concrete file system: URI, configuration, user.
        fs = FileSystem.get(new URI("hdfs://hadoopNode01:9000/"), conf, "hadoop");
    }

    // Upload a file
    @Test
    public void test1() {
        try {
            fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("/"));
            //fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("hdfs://hadoopNode01:9000/"));
            //fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("/4-2.png")); // upload and rename
            //fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("hdfs://hadoopNode01:9000/4-3.png")); // upload and rename
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Download a file
    @Test
    public void test2() {
        try {
            // ok
            // fs.copyToLocalFile(false, new Path("/4.png"), new Path("e:/4.png"), true);
            fs.copyToLocalFile(false, new Path("hdfs://hadoopNode01:9000/4.png"), new Path("e:/4.png"), true);
            System.out.println("ok");
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }

    // Create a directory; nested directories are created recursively
    @Test
    public void test3() {
        try {
            fs.mkdirs(new Path("/aaa/bbb/ccc"));
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Delete a directory; passing true deletes nested directories recursively
    @Test
    public void test4() {
        try {
            // The second argument "true" means delete recursively (subdirectories and files).
            boolean b = fs.delete(new Path("/aaa/bbb"), true);
            System.out.println("deleted: " + b);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // List all files (and directories) in a directory; only the immediate children, not nested ones
    @Test
    public void test5() {
        try {
            System.out.println("---------------------------------");
            FileStatus[] listStatus = fs.listStatus(new Path("/aaa/bbb"));
            for (FileStatus status : listStatus) {
                String name = status.getPath().getName();
                System.out.println(name + (status.isDirectory() ? " is a directory" : " is a file"));
            }
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
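test5 above only lists the immediate children of a directory. If you also want to walk subdirectories, FileSystem offers listFiles(path, recursive), which returns files (but not directories) from the whole subtree. Here is a small standalone sketch, not part of the original post, reusing the same connection settings; the class name HdfsListFilesDemo is mine:

HdfsListFilesDemo.java

package com.tingcream.hadoopStudy.hdfs;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class HdfsListFilesDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoopNode01:9000/"), conf, "hadoop");

        // listFiles(path, true) walks the tree recursively and returns files only
        // (directories are skipped), unlike listStatus() in test5.
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/"), true);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            System.out.println(status.getPath() + "  " + status.getLen() + " bytes");
        }
        fs.close();
    }
}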
HdfsDemo2.java
package com.tingcream.hadoopStudy.hdfs;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URI;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;

public class HdfsDemo2 {

    private FileSystem fs = null;

    @Before
    public void init() throws Exception {
        // Reads the xxx-site.xml files on the classpath (core-site.xml, hdfs-site.xml, ...),
        // parses them and wraps the settings in a conf object.
        Configuration conf = new Configuration();
        /*
         * Note: configure the hosts file on your local (Windows) machine so that
         * hadoopNode01 and hadoopNode02 resolve to the right IPs:
         *   192.168.9.11 hadoopNode01
         *   192.168.9.12 hadoopNode02
         */
        // Settings can also be set programmatically; they override the values read from the config files.
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        // Obtain a client instance for the concrete file system: URI, configuration, user.
        fs = FileSystem.get(new URI("hdfs://hadoopNode01:9000/"), conf, "hadoop");
    }

    /**
     * Upload a local file to HDFS using streams
     */
    @Test
    public void test1() {
        FSDataOutputStream out = null;
        FileInputStream in = null;
        try {
            // local file stream
            in = new FileInputStream("e:/jdk-7u80-linux-x64.tar.gz");
            // remote file stream
            Path remote = new Path("/jdk-7u80-linux-x64.tar.gz");
            out = fs.create(remote);
            IOUtils.copy(in, out); // stream copy
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(out);
        }
    }

    /**
     * Download a file from HDFS to the local disk using streams
     */
    @Test
    public void test2() {
        FSDataInputStream in = null;
        FileOutputStream out = null;
        try {
            // remote file stream
            in = fs.open(new Path("/jdk-7u80-linux-x64.tar.gz"));
            // local file stream
            out = new FileOutputStream("d:/jdk-7u80-linux-x64.tar.gz");
            IOUtils.copy(in, out); // stream copy
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(out);
        }
    }
}
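Because fs.open() returns an FSDataInputStream, you are not limited to streaming a whole file: you can also seek to an arbitrary offset and read only part of it. The following is a minimal standalone sketch, not part of the original post, reusing the same connection settings and the file uploaded in test1; the class name HdfsSeekDemo is mine:

HdfsSeekDemo.java

package com.tingcream.hadoopStudy.hdfs;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsSeekDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoopNode01:9000/"), conf, "hadoop");

        FSDataInputStream in = fs.open(new Path("/jdk-7u80-linux-x64.tar.gz"));
        try {
            in.seek(1024);            // jump to byte offset 1024
            byte[] buf = new byte[64];
            int n = in.read(buf);     // read up to 64 bytes starting at that offset
            System.out.println("read " + n + " bytes from offset 1024");
        } finally {
            in.close();
        }
        fs.close();
    }
}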
log4j.properties
log4j.rootLogger=DEBUG,stdout
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.Threshold = DEBUG
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d [%t] %-5p [%c] - %m%n
Note: if running the Java code fails with an error like
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCrc32.nativeCompute
....
you need to install the Windows native binaries (hadoop.dll and winutils.exe) built for your local environment.
Baidu cloud download for the win64 hadoop.dll and winutils.exe matching hadoop-2.7.6:
Link: https://pan.baidu.com/s/1prVE5qPWNi5jwd0hMShX2g   Password: mwst
After downloading, put hadoop.dll and winutils.exe into c:\windows\system32\ and restart your computer.
Run the Java code above again and everything should work.
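A commonly used alternative to copying the files into system32 (this is my own note, not from the original post) is to keep the Windows binaries in a local Hadoop directory and point the JVM at it before any FileSystem is created. Setting hadoop.home.dir takes care of winutils.exe; for the NativeCrc32 error specifically, hadoop.dll must still be loadable, so keep that bin directory on PATH or on java.library.path. The path below is an assumption, adjust it to wherever you unpacked the binaries:

WindowsHadoopSetup.java

public class WindowsHadoopSetup {
    // Call once before creating any FileSystem instance (e.g. from an @Before method).
    public static void apply() {
        // Assumed local path: a directory containing bin\winutils.exe and bin\hadoop.dll.
        System.setProperty("hadoop.home.dir", "D:/hadoop-2.7.6");
        // hadoop.dll must also be loadable by the JVM: keep D:\hadoop-2.7.6\bin on PATH,
        // or start the JVM with -Djava.library.path=D:/hadoop-2.7.6/bin
    }
}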