Setting up a Hadoop pseudo-distributed environment

0) My environment
macOS Big Sur Version 11.6
JDK 1.8 (jdk1.8.0_321)
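
## (optional) confirm where JDK 8 lives on macOS; the printed path should
## match the JAVA_HOME set in hadoop-env.sh below
$ /usr/libexec/java_home -v 1.8
/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home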

1) Download and install
## download hadoop-2.8.0.tar.gz from http://archive.apache.org/dist/
## unpack to ~/work/hadoop
## set environment variables
$ cd ~/work/hadoop
$ export HADOOP_HOME=~/work/hadoop
$ export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
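
## the exports above last only for the current shell; to persist them
## (assuming zsh, the default shell on Big Sur), append them to ~/.zshrc
$ cat >> ~/.zshrc <<'EOF'
export HADOOP_HOME=~/work/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
EOF
$ source ~/.zshrc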

2) Configure and start
## modify configuration for pseudo-dist
$ cd ~/work/hadoop/etc/hadoop
$ diff -u hadoop-env.sh.orig hadoop-env.sh

--- hadoop-env.sh.orig	2017-03-17 13:31:33.000000000 +0800
+++ hadoop-env.sh	2022-05-16 21:50:41.000000000 +0800
@@ -22,7 +22,7 @@
 # remote nodes.
 
 # The java implementation to use.
-export JAVA_HOME=${JAVA_HOME}
+export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
 
 # The jsvc implementation to use. Jsvc is required to run secure datanodes

$ diff -u core-site.xml.orig core-site.xml

--- core-site.xml.orig	2017-03-17 13:31:33.000000000 +0800
+++ core-site.xml	2022-06-07 16:43:09.000000000 +0800
@@ -17,4 +17,21 @@
 <!-- Put site-specific property overrides in this file. -->
 
 <configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://localhost:9000</value>
+    </property>
+    <property>
+        <name>hadoop.tmp.dir</name>
+        <value>/Users/sun_xo/work/hadoop/data/tmp</value>
+    </property>
+    <!-- OOZIE -->
+    <property>
+        <name>hadoop.proxyuser.sun_xo.hosts</name>
+        <value>*</value>
+    </property>
+    <property>
+        <name>hadoop.proxyuser.sun_xo.groups</name>
+        <value>*</value>
+    </property>
 </configuration>

$ diff -u hdfs-site.xml.orig hdfs-site.xml

--- hdfs-site.xml.orig	2017-03-17 13:31:36.000000000 +0800
+++ hdfs-site.xml	2022-05-28 11:22:22.000000000 +0800
@@ -17,5 +17,8 @@
 <!-- Put site-specific property overrides in this file. -->
 
 <configuration>
-
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
 </configuration>
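
## note: mr.sh (listed at the end) switches configs by symlinking *.orig or
## *.pseudo variants, so the edited files shown in these diffs are kept as
## core-site.xml.pseudo, hdfs-site.xml.pseudo, etc.
## once the links are in place, double-check the effective values:
$ hdfs getconf -confKey fs.defaultFS
hdfs://localhost:9000
$ hdfs getconf -confKey dfs.replication
1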

## format hdfs
$ hdfs namenode -format
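
## note: format only once; re-running it generates a new clusterID and the
## DataNode will refuse to start until hadoop.tmp.dir is cleared
## start-dfs.sh launches each daemon over ssh, so passwordless ssh to
## localhost must work first (on macOS also enable System Preferences >
## Sharing > Remote Login); the one-time setup from the Hadoop single-node guide:
$ ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
$ chmod 0600 ~/.ssh/authorized_keys
$ ssh localhost date          # should print the date without a password prompt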

## start hadoop as pseudo-dist
$ cd ~/work/hadoop
$ mr.sh start

## create home on hdfs
$ hdfs dfs -mkdir -p /user/sun_xo
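
## relative HDFS paths resolve against this home directory, which is what
## the wordcount steps in mr.sh rely on; verify it exists:
$ hdfs dfs -ls /user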

## modify for pseudo-dist with yarn
$ cd ~/work/hadoop/etc/hadoop
$ diff -u yarn-site.xml.orig yarn-site.xml

--- yarn-site.xml.orig	2017-03-17 13:31:42.000000000 +0800
+++ yarn-site.xml	2022-05-17 09:09:40.000000000 +0800
@@ -13,7 +13,24 @@
   limitations under the License. See accompanying LICENSE file.
 -->
 <configuration>
-
-<!-- Site specific YARN configuration properties -->
-
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>localhost</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.log.server.url</name>
+        <value>http://localhost:19888/jobhistory/logs</value>
+    </property>
+    <property>
+        <name>yarn.log-aggregation-enable</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>yarn.log-aggregation.retain-seconds</name>
+        <value>604800</value>
+    </property>
 </configuration>

$ diff -u mapred-site.xml.template mapred-site.xml

--- mapred-site.xml.template	2017-03-17 13:31:46.000000000 +0800
+++ mapred-site.xml	2022-05-16 14:52:39.000000000 +0800
@@ -17,5 +17,8 @@
 <!-- Put site-specific property overrides in this file. -->
 
 <configuration>
-
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
 </configuration>

## start hadoop as pseudo-dist with yarn
$ cd ~/work/hadoop
$ mr.sh start yarn
$ jps
26976 NameNode
27378 NodeManager
27427 JobHistoryServer
27061 DataNode
27159 SecondaryNameNode
27295 ResourceManager
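
## besides jps, the daemons can be sanity-checked from the command line
$ hdfs dfsadmin -report       # the DataNode should be listed as live
$ yarn node -list             # the NodeManager should show state RUNNING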

3) Run a MapReduce example
$ mr.sh pseudo_dist

pseudo_dist 
22/06/09 20:47:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted input
22/06/09 20:47:28 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/09 20:47:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/09 20:47:30 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted output
22/06/09 20:47:32 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/09 20:47:32 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
22/06/09 20:47:33 INFO input.FileInputFormat: Total input files to process : 1
22/06/09 20:47:33 INFO mapreduce.JobSubmitter: number of splits:1
22/06/09 20:47:33 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1654591879954_0011
22/06/09 20:47:33 INFO impl.YarnClientImpl: Submitted application application_1654591879954_0011
22/06/09 20:47:33 INFO mapreduce.Job: The url to track the job: http://localhost:8088/proxy/application_1654591879954_0011/
22/06/09 20:47:33 INFO mapreduce.Job: Running job: job_1654591879954_0011
22/06/09 20:47:39 INFO mapreduce.Job: Job job_1654591879954_0011 running in uber mode : false
22/06/09 20:47:39 INFO mapreduce.Job:  map 0% reduce 0%
22/06/09 20:47:44 INFO mapreduce.Job:  map 100% reduce 0%
22/06/09 20:47:48 INFO mapreduce.Job:  map 100% reduce 100%
22/06/09 20:47:48 INFO mapreduce.Job: Job job_1654591879954_0011 completed successfully
22/06/09 20:47:49 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=104
		FILE: Number of bytes written=273663
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=175
		HDFS: Number of bytes written=66
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Rack-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=1818
		Total time spent by all reduces in occupied slots (ms)=1834
		Total time spent by all map tasks (ms)=1818
		Total time spent by all reduce tasks (ms)=1834
		Total vcore-milliseconds taken by all map tasks=1818
		Total vcore-milliseconds taken by all reduce tasks=1834
		Total megabyte-milliseconds taken by all map tasks=1861632
		Total megabyte-milliseconds taken by all reduce tasks=1878016
	Map-Reduce Framework
		Map input records=1
		Map output records=10
		Map output bytes=103
		Map output materialized bytes=104
		Input split bytes=112
		Combine input records=10
		Combine output records=8
		Reduce input groups=8
		Reduce shuffle bytes=104
		Reduce input records=8
		Reduce output records=8
		Spilled Records=16
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=71
		CPU time spent (ms)=0
		Physical memory (bytes) snapshot=0
		Virtual memory (bytes) snapshot=0
		Total committed heap usage (bytes)=321912832
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=63
	File Output Format Counters 
		Bytes Written=66
22/06/09 20:47:49 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
apache	1
hadoop	2
http	1
hue	1
mapreduce	1
oozie	1
sqoop	2
test	1
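
## since yarn.log-aggregation-enable is true, container logs of finished jobs
## are collected onto HDFS (kept 604800 s = 7 days) and can be fetched by
## application id, e.g. for the run above:
$ yarn logs -applicationId application_1654591879954_0011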

## check from a browser
## HDFS NameNode UI - http://localhost:50070/explorer.html#/
## YARN ResourceManager UI - http://localhost:8088/cluster
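
## the same endpoints can be probed without a browser (REST/JMX in Hadoop 2.8)
$ curl -s 'http://localhost:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus'
$ curl -s http://localhost:8088/ws/v1/cluster/info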

## mr.sh

#!/bin/sh

usage() {
    echo "Usage: $0 standalone"
    echo "       $0 start [yarn]"
    echo "       $0 stop [yarn]"
    echo "       $0 pseudo_dist"
}

standalone() {
    cd $HADOOP_HOME/etc/hadoop
    ln -sf core-site.xml.orig core-site.xml
    ln -sf hdfs-site.xml.orig hdfs-site.xml
    ln -sf yarn-site.xml.orig yarn-site.xml
    rm -f mapred-site.xml
    cd $HADOOP_HOME

    mkdir -p input
    echo test apache hadoop hadoop sqoop hue mapreduce sqoop oozie http > input/in.txt
    rm -rf output
    hadoop jar $jarfile wordcount input output
    cat output/*
}

start() {
    cd $HADOOP_HOME/etc/hadoop
    ln -sf core-site.xml.pseudo core-site.xml
    ln -sf hdfs-site.xml.pseudo hdfs-site.xml
    if [ $# -eq 0 ]; then
        ln -sf yarn-site.xml.orig yarn-site.xml
        rm -f mapred-site.xml
    else
        ln -sf yarn-site.xml.pseudo yarn-site.xml
        ln -sf mapred-site.xml.pseudo mapred-site.xml
    fi

    cd $HADOOP_HOME
    # hdfs namenode -format
    if [ "x`jps | grep -w "NameNode"`" = "x" ]; then
        start-dfs.sh
    fi
    hdfs dfsadmin -safemode leave
    if [ $# -eq 1 -a "x`jps | grep -w "NodeManager"`" = "x" ]; then
        start-yarn.sh
        mr-jobhistory-daemon.sh start historyserver
    fi
}

stop() {
    if [ $# -eq 1 ]; then
        mr-jobhistory-daemon.sh stop historyserver
        stop-yarn.sh
    fi
    stop-dfs.sh
}

pseudo_dist() {
    cd $HADOOP_HOME
    mkdir -p input
    echo test apache hadoop hadoop sqoop hue mapreduce sqoop oozie http > input/in.txt
    hdfs dfs -rm -f -r input
    hdfs dfs -mkdir input
    hdfs dfs -put input/in.txt input
    hdfs dfs -rm -f -r output
    hadoop jar $jarfile wordcount input output
    hdfs dfs -cat output/*
}

oper=$1
yarn=$2
jarfile=$HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.0.jar

if [ \( x$oper = "xstandalone" -o x$oper = "xstart" -o x$oper = "xstop" -o x$oper = "xpseudo_dist" \) \
        -a \( x$yarn = "x" -o x$yarn = "xyarn" \) ]; then
    echo "$oper $yarn"
    $oper $yarn
else
    usage
    exit 1
fi
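
## mr.sh expects HADOOP_HOME to be set and to be found on PATH;
## a minimal install (assuming ~/bin is on PATH):
$ chmod +x mr.sh
$ mv mr.sh ~/bin/
$ mr.sh                       # no arguments: prints usage and exits 1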

reference: https://hadoop.apache.org/docs/r2.8.0/
