Hadoop Basics

1. Environment
  • OS: CentOS 6.5
  • Internet access: connected
  • Hadoop

    • Version 2.8.1
    • Single-node, pseudo-distributed
    • The DFS daemons start successfully under the root user
    • Working directory and initial permissions

      [root@hadoop000 software]# pwd
      /opt/software
      [root@hadoop000 software]# ll
      total 414612
      drwxrwxr-x. 10 root root      4096 May 29 02:52 hadoop-2.8.1
      -rw-r--r--.  1 root root 424555111 May 29 01:05 hadoop-2.8.1.tar.gz
      [root@hadoop000 software]# ll hadoop-2.8.1
      total 152
      drwxrwxr-x. 2 root root  4096 Jun  2  2017 bin
      drwxrwxr-x. 3 root root  4096 Jun  2  2017 etc
      drwxrwxr-x. 2 root root  4096 Jun  2  2017 include
      drwxrwxr-x. 3 root root  4096 Jun  2  2017 lib
      drwxrwxr-x. 2 root root  4096 Jun  2  2017 libexec
      -rw-rw-r--. 1 root root 99253 Jun  2  2017 LICENSE.txt
      drwxr-xr-x. 2 root root  4096 May 29 03:55 logs
      -rw-rw-r--. 1 root root 15915 Jun  2  2017 NOTICE.txt
      -rw-r--r--. 1 root root  1366 Jun  2  2017 README.txt
      drwxrwxr-x. 2 root root  4096 Jun  2  2017 sbin
      drwxrwxr-x. 4 root root  4096 Jun  2  2017 share
2. Create a hadoop user and set up SSH trust
  • Purpose: we have been using the root user for testing, but in development, and especially in production, most work is done as an ordinary user. We therefore create a hadoop user, give it the appropriate permissions, and use it for the rest of this tutorial, switching back to root temporarily when needed.
  • Commands:

    
    # Clean up the Hadoop-related data left behind by the root user
    
    [root@hadoop000 software]# cd /tmp/
    [root@hadoop000 tmp]# ll
    total 68
    drwxr-xr-x. 4 root root 4096 May 29 02:25 hadoop-root
    -rw-r--r--. 1 root root    6 May 29 03:55 hadoop-root-datanode.pid
    -rw-r--r--. 1 root root    6 May 29 03:55 hadoop-root-namenode.pid
    -rw-r--r--. 1 root root    6 May 29 03:55 hadoop-root-secondarynamenode.pid
    drwxr-xr-x. 2 root root 4096 May 29 03:56 hsperfdata_root
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_0_0_0_0_50070_hdfs____w2cu08
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_0_0_0_0_50090_secondary____y6aanv
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_localhost_38800_datanode____.je4w6e
    drwxr-xr-x. 3 root root 4096 May 29 02:52 Jetty_localhost_56524_datanode____2cxwwt
    drwx------. 2 root root 4096 May 28 23:54 keyring-HHuTDE
    drwx------. 2 gdm  gdm  4096 May 28 23:54 orbit-gdm
    drwx------. 2 root root 4096 May 29 00:01 orbit-root
    drwx------. 2 root root 4096 May 28 23:54 pulse-sVqNyeEmgdGT
    drwx------. 2 gdm  gdm  4096 May 28 23:54 pulse-VJswktkPFPh8
    drwx------. 2 root root 4096 May 28 23:54 virtual-root.mg0ADE
    -rw-r--r--. 1 root root    6 May 29 02:25 yarn-root-nodemanager.pid
    -rw-r--r--. 1 root root    6 May 29 02:25 yarn-root-resourcemanager.pid
    -rw-------. 1 root root    0 May 28 23:44 yum.log
    [root@hadoop000 tmp]# kill -9 $(pgrep -f hadoop) # stop the Hadoop processes
    [root@hadoop000 tmp]# rm -rf hadoop-* hsperfdata*
    [root@hadoop000 tmp]# ll
    total 48
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_0_0_0_0_50070_hdfs____w2cu08
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_0_0_0_0_50090_secondary____y6aanv
    drwxr-xr-x. 3 root root 4096 May 29 03:55 Jetty_localhost_38800_datanode____.je4w6e
    drwxr-xr-x. 3 root root 4096 May 29 02:52 Jetty_localhost_56524_datanode____2cxwwt
    drwx------. 2 root root 4096 May 28 23:54 keyring-HHuTDE
    drwx------. 2 gdm  gdm  4096 May 28 23:54 orbit-gdm
    drwx------. 2 root root 4096 May 29 00:01 orbit-root
    drwx------. 2 root root 4096 May 28 23:54 pulse-sVqNyeEmgdGT
    drwx------. 2 gdm  gdm  4096 May 28 23:54 pulse-VJswktkPFPh8
    drwx------. 2 root root 4096 May 28 23:54 virtual-root.mg0ADE
    -rw-r--r--. 1 root root    6 May 29 02:25 yarn-root-nodemanager.pid
    -rw-r--r--. 1 root root    6 May 29 02:25 yarn-root-resourcemanager.pid
    -rw-------. 1 root root    0 May 28 23:44 yum.log
    
    
    
    
    # Create the new user
    
    [root@hadoop000 tmp]# cd -
    /opt/software
    [root@hadoop000 software]# useradd hadoop
    [root@hadoop000 software]# passwd hadoop 
    Changing password for user hadoop.
    New password: 
    Retype new password: 
    passwd: all authentication tokens updated successfully.
    [root@hadoop000 software]# id hadoop
    uid=501(hadoop) gid=501(hadoop) groups=501(hadoop)
    [root@hadoop000 software]# chown -R hadoop:hadoop hadoop-2.8.1
    
    
    
    # Set up SSH trust (passwordless login) for the new user
    
    [root@hadoop000 ~]$ su - hadoop
    Password: 
    [hadoop@hadoop000 ~]$ ll .ssh
    ls: cannot access .ssh: No such file or directory
    [hadoop@hadoop000 ~]$ ssh-keygen
    Generating public/private rsa key pair.
    Enter file in which to save the key (/home/hadoop/.ssh/id_rsa): 
    .....
    .....
    The key's randomart image is:
    +--[ RSA 2048]----+
    |                 |
    |                 |
    |                 |
    |   . . .         |
    |. . + . S        |
    |.+ . . =         |
    |o=o   o .        |
    |E=.              |
    |&Bo              |
    +-----------------+
    [hadoop@hadoop000 ~]$ cd .ssh
    [hadoop@hadoop000 .ssh]$ ll
    total 8
    -rw-------. 1 hadoop hadoop 1675 May 29 04:37 id_rsa
    -rw-r--r--. 1 hadoop hadoop  398 May 29 04:37 id_rsa.pub
    [hadoop@hadoop000 .ssh]$ cat id_rsa.pub >> authorized_keys
    [hadoop@hadoop000 .ssh]$ ll
    total 12
    -rw-rw-r--. 1 hadoop hadoop  398 May 29 04:38 authorized_keys
    -rw-------. 1 hadoop hadoop 1675 May 29 04:37 id_rsa
    -rw-r--r--. 1 hadoop hadoop  398 May 29 04:37 id_rsa.pub
    [hadoop@hadoop000 .ssh]$ chmod 600 authorized_keys # set the key file permissions to 600, as the official docs require
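    # Optional check (not part of the original session): with the key added and permissions
    # set, the following should log in without a password prompt (the very first connection
    # may still ask to confirm the host key):
    #   ssh hadoop000 date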
    
    
    
    # Point the DataNode startup address (the slaves file) at the local IP
    
    [hadoop@hadoop000 .ssh]$ cd /opt/software/hadoop-2.8.1
    [hadoop@hadoop000 hadoop-2.8.1]$ cd etc/hadoop/
    [hadoop@hadoop000 hadoop]$ ll slaves 
    -rw-rw-r--. 1 hadoop hadoop 10 Jun  2  2017 slaves
    [hadoop@hadoop000 hadoop]$ vi slaves
    192.168.137.138
    :wq
    "slaves" 1L, 16C written   
    [hadoop@hadoop000 etc]$ cd ../..
    [hadoop@hadoop000 hadoop-2.8.1]$  
    
    
    # Set the SecondaryNameNode startup address to this host
    
    [hadoop@hadoop000 hadoop]$ vi hdfs-site.xml 
    ....
    <!-- Put site-specific property overrides in this file. -->
    <configuration>
        <property>
                <name>dfs.replication</name>
                <value>1</value>
        </property>
    
        <property>
                <name>dfs.namenode.secondary.http-address</name>
                <value>192.168.137.130:50090</value>
        </property>
    
        <property>
                <name>dfs.namenode.secondary.https-address</name>
                <value>192.168.137.130:50091</value>
        </property>
    </configuration>
    :wq!
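    # Note (added): dfs.replication is 1 because this pseudo-distributed setup has only one
    # DataNode; the two secondary-namenode properties pin the SecondaryNameNode HTTP/HTTPS
    # endpoints to a fixed address instead of the default 0.0.0.0.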
    
    
    
    # To be safe, go back to /tmp and remove the Hadoop-related temporary files once more to avoid conflicts
    
    [hadoop@hadoop000 tmp]$ rm -rf /tmp/*hadoop*
    
    
    # Move the Hadoop temp directory out of the shared /tmp so it is not cleaned up automatically
    
    [hadoop@hadoop000 hadoop]$ vi core-site.xml 
    ....
    <!-- Put site-specific property overrides in this file. -->
    
    <configuration>
        <property>
                <name>fs.defaultFS</name>
                <value>hdfs://hadoop000:9000</value>
        </property>
    
        <property>
                    <name>hadoop.tmp.dir</name>
                    <value>/opt/software/hadoop-2.8.1/hadoop-tmp</value>
        </property>
    
    </configuration>
    :wq!
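    # Note (added): hadoop.tmp.dir is the parent of the default NameNode/DataNode data
    # directories (dfs/name, dfs/data); keeping it outside /tmp prevents the OS from
    # cleaning it up. The directory should be created automatically on format/startup,
    # since the hadoop user owns /opt/software/hadoop-2.8.1 (chown above).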
    
    
    
    # Format HDFS (the NameNode metadata)
    
    [hadoop@hadoop000 tmp]$ hdfs namenode -format
    
    /************************************************************
    SHUTDOWN_MSG: Shutting down NameNode at hadoop000/192.168.137.138
    ************************************************************/
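    # If the format succeeded, the output above should also contain a line similar to
    # "Storage directory .../hadoop-tmp/dfs/name has been successfully formatted."
    # (wording recalled from Hadoop 2.x, not copied from this session).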
    [hadoop@hadoop000 tmp]$ cd /opt/software/hadoop-2.8.1
    
    
    
    # First start-up, accepting the new host key
    
    [hadoop@hadoop000 hadoop-2.8.1]$ sbin/start-dfs.sh
    Starting namenodes on [hadoop000]
    hadoop000: starting namenode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-namenode-hadoop000.out
    localhost: starting datanode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-datanode-hadoop000.out
    Starting secondary namenodes [0.0.0.0]
    The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established.
    RSA key fingerprint is 9c:c2:02:1a:66:96:da:a4:ca:59:0e:c5:a7:6c:11:dd.
    Are you sure you want to continue connecting (yes/no)? yes
    0.0.0.0: Warning: Permanently added '0.0.0.0' (RSA) to the list of known hosts.
    0.0.0.0: starting secondarynamenode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-secondarynamenode-hadoop000.out
    
    # Confirm that all daemons started
    
    [hadoop@hadoop000 hadoop-2.8.1]$ jps
    63953 NameNode
    64380 Jps
    64269 SecondaryNameNode
    64094 DataNode
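    # Optional check (not from the original session): the DataNode should have registered
    # with the NameNode; the report should list one live DataNode:
    #   hdfs dfsadmin -report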
    
3. Pseudo-distributed deployment of YARN/MapReduce
  • Purpose: these are core components of Hadoop.
  • Commands:

    
    # Configure MapReduce
    
    [hadoop@hadoop000 hadoop]$ pwd
    /opt/software/hadoop-2.8.1/etc/hadoop
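    # Note (not shown in this session): Hadoop 2.8.x ships only mapred-site.xml.template in
    # this directory, so the file may first need to be created, e.g.:
    #   cp mapred-site.xml.template mapred-site.xml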
    [hadoop@hadoop000 hadoop]$ vi mapred-site.xml
    ....
    <!-- Put site-specific property overrides in this file. -->
    <configuration>
        <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
        </property>
    </configuration>
    :wq!
    
    
    # Configure YARN
    
    [hadoop@hadoop000 hadoop]$ vi yarn-site.xml 
    ...... 
    <configuration>
        <property>
                <name>yarn.nodemanager.aux-services</name>
                <value>mapreduce_shuffle</value>
        </property>
    </configuration>
    :wq!
    
    
    # Confirm that all Hadoop components can be started
    
    [hadoop@hadoop000 hadoop-2.8.1]$ pwd
    /opt/software/hadoop-2.8.1
    [hadoop@hadoop000 hadoop-2.8.1]$ sbin/start-all.sh 
    This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
    Starting namenodes on [hadoop000]
    hadoop000: starting namenode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-namenode-hadoop000.out
    hadoop000: starting datanode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-datanode-hadoop000.out
    Starting secondary namenodes [hadoop000]
    hadoop000: starting secondarynamenode, logging to /opt/software/hadoop-2.8.1/logs/hadoop-hadoop-secondarynamenode-hadoop000.out
    starting yarn daemons
    starting resourcemanager, logging to /opt/software/hadoop-2.8.1/logs/yarn-hadoop-resourcemanager-hadoop000.out
    hadoop000: starting nodemanager, logging to /opt/software/hadoop-2.8.1/logs/yarn-hadoop-nodemanager-hadoop000.out
    [hadoop@hadoop000 hadoop-2.8.1]$ jps
    4644 NodeManager
    4165 DataNode
    4053 NameNode
    4534 ResourceManager
    4760 Jps
    4376 SecondaryNameNode
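    # Optional check (not from the original session): the NodeManager should also appear as
    # a RUNNING node in YARN:
    #   yarn node -list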

    Once all components are confirmed running, you can open the web UIs from the Windows side: the HDFS UI listens on port 50070 and the YARN UI on port 8088. A detailed walkthrough of these pages will follow later.
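
    As a quick reachability check from the VM itself (a sketch, not from the original session; it assumes the default ports above, that curl is installed, and that hadoop000 resolves locally), both requests should end in HTTP 200:

    curl -s -L -o /dev/null -w "%{http_code}\n" http://hadoop000:50070
    curl -s -L -o /dev/null -w "%{http_code}\n" http://hadoop000:8088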

4. Testing the main HDFS commands
  • Purpose: HDFS behaves like another file system deployed on top of CentOS; to use Hadoop, target files must first be placed into this guest file system, so creating, uploading, downloading, and deleting files are the basic steps in learning Hadoop. Modifying file contents in place inside HDFS is not covered, since it is not a meaningful operation there.
  • Commands:
# Listing and viewing
## List the subdirectories and files under a directory, using the root directory as the example
[hadoop@hadoop000 ~]$ hadoop fs -ls /
Found 2 items
drwx------   - hadoop supergroup          0 2018-06-10 04:40 /tmp
drwxr-xr-x   - hadoop supergroup          0 2018-06-10 04:40 /user

## Recursively list all directories and files under a directory, using the root as the example
[hadoop@hadoop000 ~]$ hadoop fs -ls -R /
drwx------   - hadoop supergroup          0 2018-06-10 04:40 /tmp
drwx------   - hadoop supergroup          0 2018-06-10 04:40 
....
.....
drwxr-xr-x   - hadoop supergroup          0 2018-06-10 04:40 /user
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:03 /user/hadoop

## View the contents of a file
[hadoop@hadoop000 ~]$ hadoop fs -cat /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop/job_1528576803667_0005_conf.xml
....
....
<property><name>mapreduce.jobhistory.intermediate-done-dir</name><value>${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate</value><source>mapred-default.xml</source><source>job.xml</source></property>
<property><name>fs.s3a.attempts.maximum</name><value>20</value><source>core-default.xml</source><source>job.xml</source></property>
</configuration>


## View the last 1 KB of a file
[hadoop@hadoop000 ~]$ hadoop fs -tail /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop/job_1528576803667_0005_conf.xml
....
....
<property><name>mapreduce.jobhistory.intermediate-done-dir</name><value>${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate</value><source>mapred-default.xml</source><source>job.xml</source></property>
<property><name>fs.s3a.attempts.maximum</name><value>20</value><source>core-default.xml</source><source>job.xml</source></property>
</configuration>

## Show the sizes of the subdirectories and files under a directory
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -du /
540011  /tmp
0       /user
178     /worldcount
## Total size of a directory
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -du -s /
540189  /

## Display sizes in human-readable form
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -du -s -h /
527.5 K  /

## Count the directories (all descendants included), files, and total size under a directory
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -count /
          15           20             540189 /
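## (added note) the three columns of -count are: directory count, file count, and total content size in bytes, followed by the path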




# Create directories and files
## Create a directory
[hadoop@hadoop000 ~]$ hadoop fs -mkdir -p /worldcount/input
[hadoop@hadoop000 ~]$ hadoop fs -ls -R /
....
.....
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:40 /worldcount
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:40 /worldcount/input
## Create a file
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -put - /hallowd ## create /hallowd from keyboard input
hallo 
world  ## Ctrl+D ends the input
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -cat /hallowd
hallo
world

# Put local (CentOS) files and directories into HDFS
[hadoop@hadoop000 hdfs-bsp]$ pwd
/home/hadoop/hdfs-bsp
[hadoop@hadoop000 hdfs-bsp]$ ll
total 16
-rw-rw-r--. 1 hadoop hadoop 12 Jun 13 00:44 china
-rw-rw-r--. 1 hadoop hadoop 12 Jun 13 00:44 halloworld.log
-rw-rw-r--. 1 hadoop hadoop 12 Jun 13 00:45 henan
-rw-rw-r--. 1 hadoop hadoop 14 Jun 13 00:45 huixian
## Put a single file
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -put /home/hadoop/hdfs-bsp/china /worldcount/input/
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:48 /worldcount/input
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:48 /worldcount/input/china
## Put an entire directory
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -put /home/hadoop/hdfs-bsp/ /worldcount/input/
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:49 /worldcount/input
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:48 /worldcount/input/china
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:49 /worldcount/input/hdfs-bsp
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/china
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/halloworld.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/henan
-rw-r--r--   1 hadoop supergroup         14 2018-06-13 00:49 /worldcount/input/hdfs-bsp/huixian

# Get files/directories from HDFS back to CentOS
## Download a file from HDFS to CentOS; directories work the same way
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -get /worldcount/input/hdfs-bsp/china  /home/hadoop/hdfs-bsp/china1.log
[hadoop@hadoop000 hdfs-bsp]$ cat /home/hadoop/hdfs-bsp/china1.log
hallo china
## Merge the files under an HDFS directory and download the result to CentOS
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -getmerge -nl /worldcount/input/hdfs-bsp/ /home/hadoop/hdfs-bsp/merge.log
[hadoop@hadoop000 hdfs-bsp]$ cat /home/hadoop/hdfs-bsp/merge.log
hallo china

hallo world

hallo henan

hallo huixian
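## (added note) the blank lines above come from the -nl option, which appends a newline after each merged file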

# Delete files/directories in HDFS
## Delete a file
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -rm /worldcount/input/china
Deleted /worldcount/input/china
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:54 /worldcount/input
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:49 /worldcount/input/hdfs-bsp
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/china
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/halloworld.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 00:49 /worldcount/input/hdfs-bsp/henan
-rw-r--r--   1 hadoop supergroup         14 2018-06-13 00:49 /worldcount/input/hdfs-bsp/huixian


## Delete a directory
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -rm -r /worldcount/input/hdfs-bsp/
Deleted /worldcount/input/hdfs-bsp
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:56 /worldcount/input
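## (added note) with the default fs.trash.interval=0, these deletes are immediate and permanent;
## if trash were enabled, -skipTrash would bypass it, e.g.:
##   hadoop fs -rm -r -skipTrash /worldcount/input/hdfs-bsp/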


# Copy/move files and directories within HDFS
## Current layout
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount/input/hdfs-bsp/
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/china
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/china1.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/halloworld.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/henan
-rw-r--r--   1 hadoop supergroup         14 2018-06-13 01:08 /worldcount/input/hdfs-bsp/huixian
-rw-r--r--   1 hadoop supergroup         54 2018-06-13 01:08 /worldcount/input/hdfs-bsp/merge.log
## Move a file; directories work the same way
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -mv /worldcount/input/hdfs-bsp/merge.log /worldcount/input/
## Copy a directory; files work the same way
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -cp /worldcount/input/hdfs-bsp/ /worldcount/input/cp-bsp
## Result
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls -R /worldcount/input/
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 01:13 /worldcount/input/cp-bsp
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:13 /worldcount/input/cp-bsp/china
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:13 /worldcount/input/cp-bsp/china1.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:13 /worldcount/input/cp-bsp/halloworld.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:13 /worldcount/input/cp-bsp/henan
-rw-r--r--   1 hadoop supergroup         14 2018-06-13 01:13 /worldcount/input/cp-bsp/huixian
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 01:13 /worldcount/input/hdfs-bsp
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/china
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/china1.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/halloworld.log
-rw-r--r--   1 hadoop supergroup         12 2018-06-13 01:08 /worldcount/input/hdfs-bsp/henan
-rw-r--r--   1 hadoop supergroup         14 2018-06-13 01:08 /worldcount/input/hdfs-bsp/huixian
-rw-r--r--   1 hadoop supergroup         54 2018-06-13 01:08 /worldcount/input/merge.log


# Set the replication factor of a file/directory
## Set the replication factor to 3, using a file as the example; for directories it is similar (with -R)
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -setrep  3 /hallowd
Replication 3 set: /hallowd
[hadoop@hadoop000 hdfs-bsp]$ hadoop fs -ls /
Found 4 items
## the replication factor of this file is now 3 (the *3* below is an annotation added here, not actual output)
-rw-r--r--   *3* hadoop supergroup         12 2018-06-13 01:32 /hallowd
drwx------   - hadoop supergroup          0 2018-06-10 04:40 /tmp
drwxr-xr-x   - hadoop supergroup          0 2018-06-10 04:40 /user
drwxr-xr-x   - hadoop supergroup          0 2018-06-13 00:40 /worldcount
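## (added note) with only one DataNode the file stays under-replicated even though the target
## is 3; this can be confirmed with, e.g.:
##   hdfs fsck /hallowd -files -blocks
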
5. MapReduce usage examples

Purpose: MapReduce is the computation framework; Hadoop ships with some example programs, which we run here as examples.
Commands:

# Locate the example jar
[hadoop@hadoop000 hadoop-2.8.1]$ pwd
/opt/software/hadoop-2.8.1
[hadoop@hadoop000 hadoop-2.8.1]$ find ./ -name "*example*"
./share/hadoop/mapreduce/sources/hadoop-mapreduce-examples-2.8.1-sources.jar
./share/hadoop/mapreduce/sources/hadoop-mapreduce-examples-2.8.1-test-sources.jar
./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar
./share/hadoop/mapreduce/lib-examples
./lib/native/examples
./etc/hadoop/ssl-server.xml.example
./etc/hadoop/ssl-client.xml.example
# See which programs are available and how to invoke them
[hadoop@hadoop000 hadoop-2.8.1]$ yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar
An example program must be given as the first argument. 
Valid program names are: # pass one of the following program names as the first argument
  ....
  pi: A map/reduce program that estimates Pi using a quasi-Monte Carlo method. ## the pi estimator
  .....
  wordcount: A map/reduce program that counts the words in the input files.  ## the word-count program
  .....



# The pi example
[hadoop@hadoop000 hadoop-2.8.1]$ yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar pi 
Usage: org.apache.hadoop.examples.QuasiMonteCarlo <nMaps> <nSamples> # shows which arguments are required
[hadoop@hadoop000 hadoop-2.8.1]$ yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar pi 2 5
Number of Maps  = 2
Samples per Map = 5
Wrote input for Map #0
Wrote input for Map #1
Starting Job
18/06/13 01:56:22 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/06/13 01:56:23 INFO input.FileInputFormat: Total input files to process : 2
......
18/06/13 01:56:24 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1528816763742_0001 
### job_<number>_<seq>: the long number is a timestamp in milliseconds since the Unix epoch (1970-01-01), i.e. when the cluster (ResourceManager) was started, followed by a per-cluster job sequence number
....
## elapsed time and result
Job Finished in 46.943 seconds
Estimated value of Pi is 3.60000000000000000000
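## (added note) 3.6 is a rough estimate because only 2 maps x 5 samples were used; the
## quasi-Monte Carlo estimate gets closer to pi with more samples, e.g.:
##   yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar pi 10 100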



# The word-count program
[hadoop@hadoop000 hadoop-2.8.1]$ yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar wordcount
## the usage message shows that input and output directories are required arguments
Usage: wordcount <in> [<in>...] <out>
[hadoop@hadoop000 hadoop-2.8.1]$ yarn jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar wordcount /worldcount/input/hdfs-bsp /worldcount/output-wc/
.....
.....
## no errors occurred during the job
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=62
        File Output Format Counters 
                Bytes Written=42
## Inspect the output directory
[hadoop@hadoop000 hadoop-2.8.1]$ hadoop fs -ls /worldcount/output-wc/
Found 2 items
-rw-r--r--   1 hadoop supergroup          0 2018-06-13 02:05 /worldcount/output-wc/_SUCCESS # success marker
-rw-r--r--   1 hadoop supergroup         42 2018-06-13 02:05 /worldcount/output-wc/part-r-00000 # the result data
[hadoop@hadoop000 hadoop-2.8.1]$ hadoop fs -cat /worldcount/output-wc/part-r-00000
china   2
hallo   5
henan   1
huixian 1
world   1
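## (added note) re-running the job fails if the output directory already exists, so remove it first:
##   hadoop fs -rm -r /worldcount/output-wc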

We can also open the YARN web UI to follow the job's progress and check its result.
