1. Prepare the installation packages
Windows 10 64-bit host system
VMware 10
CentOS 6.4
jdk-7u80-linux-x64.rpm
hadoop-2.7.1.tar.gz
scala-2.11.6.tgz
spark-2.0.1-bin-hadoop2.7.tgz
2. Install VMware Workstation and create a new virtual machine named master, accepting the defaults (Enter all the way through)
3. Install the JDK
3.1. sudo rpm -ivh jdk-7u80-linux-x64.rpm
3.2. Set the Java environment variables
sudo gedit /etc/profile
Append the following at the end:
#set java environment
export JAVA_HOME=/usr/java/jdk1.7.0_80 # adjust the path if you downloaded a different JDK version
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin
3.3. Verify the Java environment variables (run source /etc/profile first so the changes take effect)
echo $JAVA_HOME
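As an extra sanity check (not in the original steps), confirm the JDK itself is on the PATH:
source /etc/profile
java -version   # should report java version "1.7.0_80"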
4. Install Hadoop
4.1. Unpack
tar -zxvf /usr/mywork/package/hadoop-2.7.1.tar.gz -C /usr/mywork/software
4.2. Configure the environment variables
sudo gedit /etc/profile
# set hadoop environment
export HADOOP_HOME=/usr/mywork/software/hadoop-2.7.1
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_LOG_DIR=$HADOOP_HOME/logs # create the logs directory first
export YARN_LOG_DIR=$HADOOP_LOG_DIR
Apply the changes: source /etc/profile
4.3. Verify the environment variables
echo $HADOOP_HOME
4.4. Create directories (under the Hadoop home directory; see the command sketch below)
dfs, dfs/name, dfs/data, tmp
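A minimal way to create them, assuming the Hadoop home directory is /usr/mywork/software/hadoop-2.7.1:
cd /usr/mywork/software/hadoop-2.7.1
mkdir -p dfs/name dfs/data tmp logs   # logs is the directory referenced by HADOOP_LOG_DIR above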
4.5. Edit the configuration files
4.5.1. Edit hadoop-env.sh, yarn-env.sh and mapred-env.sh, adding
export JAVA_HOME=/usr/java/jdk1.7.0_80
4.5.2. Edit core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://pmaster:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/mywork/software/hadoop-2.7.1/tmp</value>
</property>
</configuration>
4.5.3. Edit hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/mywork/software/hadoop-2.7.1/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/usr/mywork/software/hadoop-2.7.1/dfs/data</value>
</property>
</configuration>
4.5.4. Edit mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
4.5.5. Edit yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>pmaster</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
4.5.6. Edit the slaves file (etc/hadoop/slaves under the Hadoop home directory)
pa
pb
4.6. Change the hostname
sudo hostname pmaster # changes the hostname only for the current session; lost after a reboot
sudo gedit /etc/sysconfig/network # open the file and set HOSTNAME=pmaster to change the hostname permanently
reboot # needed for the permanent change to take effect if you skipped the first step
4.7. Bind hostnames to IP addresses
sudo gedit /etc/hosts
# open the file and add the entries below; placeholder IPs are fine for now, update them once the other virtual machines have obtained their addresses
192.168.184.129 pmaster
192.168.184.130 pa
192.168.184.131 pb
Save and exit, then verify with: ping pmaster
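If you need to look up a virtual machine's actual IP address for the hosts file, the usual CentOS 6 command is shown below (eth0 is the typical default interface name and may differ on your system):
ifconfig eth0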
4.8. Disable the firewall
sudo service iptables stop # stop the firewall for the current session
sudo service iptables status # check the firewall status to confirm it is stopped
sudo chkconfig iptables off # disable the firewall permanently (at boot)
sudo chkconfig --list | grep iptables # verify the firewall is off at all runlevels
5. Clone the virtual machines pa and pb, change their hostnames to pa and pb as described above, and update the IP addresses in /etc/hosts
6. Configure passwordless SSH (with all three virtual machines running)
6.1. Log in to the master virtual machine with AbsoluteTelnet
6.2. Generate a key pair with an empty passphrase
ssh-keygen -t rsa -P '' -f /home/zls/.ssh/id_rsa
6.3. Generate authorized_keys on this machine and verify passwordless SSH login to localhost
cd /home/zls/.ssh # as the current user; no need to switch to root
cat id_rsa.pub >> authorized_keys # append the public key to the authorized keys
chmod 600 authorized_keys # restrict the file permissions
ssh localhost # a successful login confirms the key works
Note: if SSH still asks for a password after the steps above, delete the files in the .ssh directory and regenerate the key pair.
6.4. Log in to the pa virtual machine with AbsoluteTelnet and generate a key pair as above
ssh-copy-id -i id_rsa.pub pmaster # copy the public key to pmaster and add it to pmaster's authorized keys
6.5. Repeat step 6.4 on pb
6.6. Log in to the pmaster virtual machine
scp authorized_keys pa:/home/zls/.ssh/; scp authorized_keys pb:/home/zls/.ssh/ # push the combined authorized_keys to pa and pb
6.7. Verify passwordless login between the virtual machines in the cluster via AbsoluteTelnet
ssh pmaster;
ssh pa;
ssh pb;
ssh pmaster;
ssh pa;
ssh pb;
ssh pa;
ssh pmaster;
7. Format the Hadoop filesystem and test
7.1. Log in to pmaster with AbsoluteTelnet
7.2. cd /usr/mywork/software/hadoop-2.7.1/bin
hadoop namenode -format # format the NameNode
7.3. start-dfs.sh
(starts the NameNode and SecondaryNameNode on the master node and a DataNode on each worker node)
7.4. start-yarn.sh
(starts the ResourceManager on the master node and a NodeManager on each worker node)
7.5. mr-jobhistory-daemon.sh start historyserver
(starts the JobHistoryServer on the node where the command is run)
(to stop it: mr-jobhistory-daemon.sh stop historyserver)
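To confirm the daemons came up, a quick jps check (not part of the original steps) should show roughly the following:
jps   # on pmaster: NameNode, SecondaryNameNode, ResourceManager, JobHistoryServer
jps   # on pa and pb: DataNode, NodeManager
The HDFS web UI at http://pmaster:50070 and the YARN UI at http://pmaster:8088 are also useful for checking cluster status.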
8. Install Scala
8.1. Download scala-2.11.6.tgz
8.2. Install: sudo tar -zxf scala-2.11.6.tgz -C /usr/mywork/software
8.3. sudo gedit /etc/profile and append the following:
#set scala environment
export SCALA_HOME=/usr/mywork/software/scala-2.11.6
export PATH=$PATH:$SCALA_HOME/bin
8.4. source /etc/profile
8.5. scala -version
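If everything is in place, the version check should print something like:
Scala code runner version 2.11.6 -- Copyright 2002-2013, LAMP/EPFL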
9. Install Spark
9.1. Unpack: sudo tar -zxf spark-2.0.1-bin-hadoop2.7.tgz -C /usr/mywork/software
9.2. Append the paths to the end of the profile
sudo gedit /etc/profile
#set spark environment
export SPARK_HOME=/usr/mywork/software/spark-2.0.1-bin-hadoop2.7
export PATH=$SPARK_HOME/bin:$PATH
source /etc/profile
9.3. Go into the conf directory and edit the slaves file
cp slaves.template slaves
sudo vi slaves # open the file and add the entries below (comment out localhost)
pa
pb
9.4. Edit the spark-env.sh file
cp spark-env.sh.template spark-env.sh
sudo gedit spark-env.sh # open the file and add the following
export JAVA_HOME=/usr/java/jdk1.7.0_80
export SCALA_HOME=/usr/mywork/software/scala-2.11.6
export HADOOP_HOME=/usr/mywork/software/hadoop-2.7.1
export HADOOP_CONF_DIR=/usr/mywork/software/hadoop-2.7.1/etc/hadoop
export SPARK_MASTER_IP=pmaster
export SPARK_WORKER_MEMORY=1g
10. Copy the Scala installation directory, the Spark installation directory and /etc/profile across nodes to pa and pb
scp -r scala-2.11.6/ pb:/usr/mywork/software
scp -r spark-2.0.1-bin-hadoop2.7/ pb:/usr/mywork/software/
scp -r scala-2.11.6/ pa:/usr/mywork/software
scp -r spark-2.0.1-bin-hadoop2.7/ pa:/usr/mywork/software/
sudo scp -r /etc/profile pa:/etc/profile # then switch to pa and run source /etc/profile
sudo scp -r /etc/profile pb:/etc/profile # then switch to pb and run source /etc/profile
11. Start the Spark cluster
start-dfs.sh
start-yarn.sh
./start-all.sh # this apparently needs to be run from Spark's sbin directory, otherwise it may be picked up as Hadoop's start-all command
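Assuming the standalone deployment described here, jps should now additionally show the Spark daemons:
jps   # on pmaster: a Master process; on pa and pb: a Worker process
The Spark master web UI is normally reachable at http://pmaster:8080.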
12. Test Spark
spark-shell # launch the interactive Spark shell
hadoop fs -put README.md /test # run from the Spark home directory in a regular shell; create the target directory first with hadoop fs -mkdir /test
var file = sc.textFile("hdfs://pmaster:9000/test/README.md")
var count = file.flatMap(line => line.split(" ")).map(word => (word,1)).reduceByKey(_+_)
count.collect
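To eyeball the result, you can sort the counts in descending order inside spark-shell (a small extension of the example above, not part of the original steps):
count.map{ case (w, c) => (c, w) }.sortByKey(false).take(10).foreach(println)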
13. Install IDEA on the pmaster virtual machine
13.1. tar -zxf ideaIC-2016.2.4.tar.gz
13.2. mv idea-IC-162.2032.8/ /usr/mywork/software/
13.3. Start IDEA and install the plugin scala-intellij-bin-2016.2.1.zip
14. Create a new Scala sbt project. sbt downloads can be very slow; to speed them up you can edit the repositories in
~/.IdeaIC2016.2/config/plugins/Scala/launcher/sbt-launch.jar/sbt/sbt.boot.properties
[repositories]
local
oschina: http://maven.oschina.net/content/groups/public/
jcenter: http://jcenter.bintray.com/
typesafe-ivy-releases: http://repo.typesafe.com/typesafe/ivy-releases/, [organization]/[module]/[revision]/[type]s/[artifact](-[classifier]).[ext], bootOnly
maven-central: http://repo1.maven.org/maven2/
After saving, click the refresh button in the SBT Project panel on the right-hand side of IDEA to download the dependencies.
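For reference, a minimal build.sbt matching the versions installed above might look like this (the project name is only an example):
name := "spark-test"
version := "1.0"
scalaVersion := "2.11.6"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.1"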