1. In /etc/hosts, map each server's hostname to its external IP, e.g.:
192.168.56.104 localhost.hadoop3
192.168.56.103 localhost.hadoop2
192.168.56.102 localhost.hadoop1
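Note that the configuration files below address the machines as master, slave1 and slave2, so /etc/hosts must also resolve those names. Assuming hadoop1 is the master and hadoop2/hadoop3 are the slaves (the mapping is not stated above), the extra entries would be:
192.168.56.102 master
192.168.56.103 slave1
192.168.56.104 slave2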
2. Set up passwordless SSH login between all the servers.
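A minimal sketch of the key exchange (assuming the same login user, here root, on every machine; ssh-keygen and ssh-copy-id are standard OpenSSH tools):
# On each server, generate a key pair (accept the defaults)
ssh-keygen -t rsa
# Push the public key to every other server
ssh-copy-id root@master
ssh-copy-id root@slave1
ssh-copy-id root@slave2
# Verify: should print the remote hostname without asking for a password
ssh slave1 hostname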
3. Download and install matching versions of Hadoop, Spark, Scala, and the JDK.
4. Download and install sbt.
5. Configure the global environment variables: vim /etc/profile (in PS1, replace master with each server's own identifier):
export PS1="master:\W \u\$"
export SPARK_HOME="/usr/local/spark2"
export PATH="$PATH:$SPARK_HOME/bin"
export SCALA_HOME="/usr/share/scala"
export PATH="$PATH:$SCALA_HOME/bin"
export FINDBUGS_HOME="/usr/local/findbugs"
export PATH="$PATH:$FINDBUGS_HOME/bin"
export PROTOBUF_HOME="/usr/local/protobuf"
export PATH="$PATH:$PROTOBUF_HOME/bin"
export HADOOP_HOME="/home/hadoop-2.7.4"
export PATH="$PATH:$HADOOP_HOME/bin"
export JAVA_HOME=/usr/lib/jvm/java
export PATH="$PATH:$JAVA_HOME/bin"
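After saving /etc/profile, reload it and sanity-check each tool (standard version flags, nothing specific to this setup):
source /etc/profile
java -version
hadoop version
scala -version
spark-submit --version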
/home/hadoop-2.7.4/etc/hadoop/core-site.xml (set hadoop.tmp.dir to a different directory on each server):
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- Which machine the NameNode runs on and its port; effectively the entry point of HDFS -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
  </property>
  <!-- Hadoop's working directory. The default is under Linux /tmp, which is wiped on every
       reboot, losing all of Hadoop's data. -->
  <!-- Several other directories, e.g. dfs.name.dir and dfs.name.edits.dir, default to
       subdirectories of this one. -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/tmp</value>
  </property>
</configuration>
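Once the file is in place on a node, you can confirm what Hadoop actually picked up (hdfs getconf ships with Hadoop 2.x):
hdfs getconf -confKey fs.defaultFS     # expect hdfs://master:9000
hdfs getconf -confKey hadoop.tmp.dir   # expect this server's own tmp directory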
/home/hadoop-2.7.4/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<configuration>
  <!-- The ResourceManager runs on the master -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>master</value>
  </property>
  <!-- Disable the physical-memory check so YARN does not kill Spark executors
       that exceed their allocated physical memory -->
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
</configuration>
/home/hadoop-2.7.4/etc/hadoop/slaves
slave1
slave2
6. Spark configuration
/usr/local/spark2/conf/spark-env.sh
export SPARK_MASTER_HOST=master
export HADOOP_CONF_DIR=/home/hadoop-2.7.4/etc/hadoop
export HADOOP_NATIVE_LIB_DIR="/home/hadoop-2.7.4/lib/native"
export LD_LIBRARY_PATH=/usr/lib64/atlas
/usr/local/spark2/conf/log4j.properties
log4j.rootCategory=INFO, console
change to:
log4j.rootCategory=WARN, console
/usr/local/spark2/conf/spark-defaults.conf
spark.master yarn
spark.yarn.jars hdfs://master:9000/spark_jars/*
/usr/local/spark2/conf/slaves
slave1
slave2
Run the shell commands:
hadoop namenode -format   # run on every server
/home/hadoop-2.7.4/sbin/start-dfs.sh
/home/hadoop-2.7.4/sbin/start-yarn.sh
/usr/local/spark2/sbin/start-all.sh
hadoop fs -mkdir /tmp
hadoop fs -mkdir /tmp2
hadoop fs -mkdir /tmp3
hadoop fs -mkdir /spark_jars
hadoop fs -put /usr/local/spark2/jars/* /spark_jars/
echo "done"
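After the start scripts finish, jps (bundled with the JDK) is a quick way to confirm the daemons came up; the expected split assumes the standalone Spark Master runs alongside the NameNode:
jps   # on master: NameNode, SecondaryNameNode, ResourceManager, Master
jps   # on each slave: DataNode, NodeManager, Worker
# Confirm the jars landed where spark.yarn.jars points:
hadoop fs -ls /spark_jars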
Submit the Spark job:
spark-submit --master yarn --class aa.MyAls aa.jar
Scala code:
package aa

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object MyAls {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("hwl.MyAls job")
    val sc = new SparkContext(conf)

    // Load the ratings file, dropping comment lines and blank lines
    val trans = sc.textFile("hdfs://master:9000/home/trans.txt")
    val dat = trans.filter(line => !line.startsWith("#"))
    val trueDat = dat.filter(line => !line.isEmpty)

    println("start line:")
    trueDat.collect().foreach(println)
    println("end line:")

    // Each line has the form "uid,product,rating"
    val ratings = trueDat.map { line =>
      val Array(uid, product, rating) = line.split(",")
      Rating(uid.toInt, product.toInt, rating.toDouble)
    }

    // Train ALS with rank 8 and 10 iterations,
    // then recommend 5 products for user 3
    val model = ALS.train(ratings, 8, 10)
    val comment = model.recommendProducts(3, 5)
    comment.foreach(println)
    println("done")
  }
}
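An end-to-end run might look like the following sketch. The input format (uid,product,rating, with # starting a comment line) comes from the parser above, but the sample values, the sbt project layout and the resulting jar path are assumptions:
# Sample input file
cat > trans.txt <<'EOF'
# uid,product,rating
1,101,5.0
1,102,3.0
2,101,4.0
2,103,1.0
3,102,2.0
3,103,5.0
EOF
hadoop fs -mkdir -p /home
hadoop fs -put trans.txt /home/trans.txt

# Build and submit (the jar path depends on your sbt project name and Scala version)
sbt package
spark-submit --master yarn --class aa.MyAls target/scala-2.11/aa.jar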