[root@localhost soft]# ls
jdk1.7.0_79 spark-1.1.0-bin-hadoop1
jdk-7u79-linux-x64.tar.gz spark-1.1.0-bin-hadoop1.tgz
[root@localhost soft]# cd jdk1.7.0_79
[root@localhost jdk1.7.0_79]# ls
bin lib src.zip
COPYRIGHT LICENSE THIRDPARTYLICENSEREADME-JAVAFX.txt
db man THIRDPARTYLICENSEREADME.txt
include README.html
jre release
[root@localhost jdk1.7.0_79]# pwd
/root/soft/jdk1.7.0_79
[root@localhost jdk1.7.0_79]# vi /etc/profile
[root@localhost jdk1.7.0_79]# source /etc/profile
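The profile edit itself is not shown above; given the pwd output, the lines added to /etc/profile would be along these lines (a minimal sketch):
export JAVA_HOME=/root/soft/jdk1.7.0_79
export PATH=$JAVA_HOME/bin:$PATH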
[root@localhost jdk1.7.0_79]# java -version
java version "1.7.0_79"
Java(TM) SE Runtime Environment (build 1.7.0_79-b15)
Java HotSpot(TM) 64-Bit Server VM (build 24.79-b02, mixed mode)
[root@localhost jdk1.7.0_79]# pwd
/root/soft/jdk1.7.0_79
[root@localhost jdk1.7.0_79]# cd ..
[root@localhost soft]# ls
jdk1.7.0_79 spark-1.1.0-bin-hadoop1
jdk-7u79-linux-x64.tar.gz spark-1.1.0-bin-hadoop1.tgz
[root@localhost soft]# cd spark-1.1.0-bin-hadoop1
[root@localhost spark-1.1.0-bin-hadoop1]# ls
bin conf examples LICENSE python RELEASE
CHANGES.txt ec2 lib NOTICE README.md sbin
[root@localhost spark-1.1.0-bin-hadoop1]# cd conf
[root@localhost conf]# ls
fairscheduler.xml.template slaves
log4j.properties.template spark-defaults.conf.template
metrics.properties.template spark-env.sh.template
[root@localhost conf]# vi spark-env.sh.template
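Note that Spark reads conf/spark-env.sh, not the .template file, so the template should be copied first. A minimal sketch of what this standalone setup would need (the values are assumptions based on the session, not captured in the transcript):
cp spark-env.sh.template spark-env.sh
# in spark-env.sh:
export JAVA_HOME=/root/soft/jdk1.7.0_79
export SPARK_MASTER_IP=localhost
export SPARK_WORKER_MEMORY=3g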
[root@localhost conf]# cd ..
[root@localhost spark-1.1.0-bin-hadoop1]# ./bin/spark-shell
(or, to attach to the standalone master: ./bin/spark-shell --master spark://localhost:7077 --executor-memory 3g)
...sparkDriver@192.168.88.152:58247/user/HeartbeatReceiver
16/04/30 22:57:10 INFO SparkILoop: Created spark context..
Spark context available as sc.
scala>
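A quick sanity check at the prompt that the context works (any small job will do):
scala> sc.parallelize(1 to 1000).reduce(_ + _)
res0: Int = 500500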
Installing the IDE (IntelliJ IDEA)
https://d1opms6zj7jotq.cloudfront.net/idea/ideaIU-2016.1.1.tar.gz
[root@localhost soft]# tar -zxf ideaIU-2016.1.1.tar.gz
[root@localhost soft]# cd idea-IU-145.597.3
[root@localhost idea-IU-145.597.3]# ./bin/idea.sh
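Before the week2 code below can be built, the Scala plugin has to be installed in IDEA (File > Settings > Plugins) and the Spark assembly jar from spark-1.1.0-bin-hadoop1/lib added as a project dependency; neither step is captured in this transcript, so the exact menu path is an assumption.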
package week2
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

// Counts the Sogou query-log records whose URL ranked 1st in the results
// but was the user's 2nd click, then writes per-user-ID record counts
// sorted by frequency. This must be an object, not a class, so that
// spark-submit can find a static main method.
object SougouQA {
  def main(args: Array[String]) {
    // Input and output default to the local sample paths used in this
    // session; the yarn-cluster submit at the end passes both as arguments.
    val input = if (args.length > 0) args(0) else "file:///root/soft/SogouQ1.txt"
    val output = if (args.length > 1) args(1) else "file:///root/soft/output"
    val conf = new SparkConf().setAppName("SougouQA")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile(input)
    // Keep only well-formed rows: exactly 6 tab-separated fields.
    val rdd2 = rdd1.map(_.split("\t")).filter(_.length == 6)
    rdd2.count()
    // Field 3 is the URL's rank in the results, field 4 the user's click order.
    val rdd3 = rdd2.filter(_(3).toInt == 1).filter(_(4).toInt == 2)
    rdd3.count()
    rdd3.toDebugString
    // Count records per user ID (field 1), then sort descending by count:
    // swap to (count, id), sortByKey(false), swap back.
    val rdd4 = rdd2.map(x => (x(1), 1)).reduceByKey(_ + _).map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1))
    rdd4.toDebugString
    rdd4.saveAsTextFile(output)
    sc.stop()
  }
}
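The rdd4 line uses the swap-sort-swap idiom to order (userID, count) pairs by count: flip each pair to (count, userID), sortByKey(false) for a descending sort, then flip back. Since Spark 1.0 the same result can be had more directly with sortBy (a sketch of the alternative, not what the course code uses):
val rdd4 = rdd2.map(x => (x(1), 1)).reduceByKey(_ + _).sortBy(_._2, ascending = false)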
package week2
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

// Joins a registration log with a click log on their shared uuid field.
// Paths are hard-coded local copies, so no command-line arguments are
// needed; the HDFS variants are kept commented out for reference.
object Join {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Join")
    val sc = new SparkContext(conf)
    val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
    case class Register(d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
    case class Click(d: java.util.Date, uuid: String, landing_page: Int)
    // Key each record on uuid (field 1).
    // val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
    val reg = sc.textFile("file:///root/soft/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
    // val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
    val clk = sc.textFile("file:///root/soft/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
    // take(2) brings two joined rows back to the driver; print them.
    reg.join(clk).take(2).foreach(println)
    sc.stop()
  }
}
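Since reg and clk are both keyed on uuid, the join yields one (uuid, (Register, Click)) pair per uuid present in both files, so take(2) returns an Array[(String, (Register, Click))] holding the first two of them.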
[root@localhost spark-1.1.0-bin-hadoop1]# bin/spark-submit --master spark://localhost:7077 --class week2.SougouQA /root/soft/week2.jar
hdfs dfs -getmerge file:///root/soft/output result
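The merge step is needed because saveAsTextFile writes one part file per partition into the output directory rather than a single file; the directory follows the usual Hadoop naming (the listing below is illustrative, not captured in the transcript):
[root@localhost soft]# ls output
_SUCCESS  part-00000  part-00001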
// Log-processing demo
// http://download.labs.sogou.com/dl/q.html  full version (2 GB), gzip format
// Record format: access time \t user ID \t [query] \t rank of this URL in the results \t sequence number of the user's click \t URL the user clicked
// SogouQ1.txt, SogouQ2.txt and SogouQ3.txt are slices cut from the full SogouQ log with head -n or tail -n
// How many records ranked 1st in the search results but were the user's 2nd click?
// Command to check the blocks: bin/hdfs fsck /dataguru/data/SogouQ1.txt -files -blocks -locations
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt", 9)  // read with 9 partitions
val rdd2 = rdd1.map(_.split("\t")).filter(_.length == 6)
val rdd3 = rdd2.filter(_(3).toInt == 1).filter(_(4).toInt == 2)
rdd3.count()  // first count: computed from HDFS
rdd3.cache()  // marks rdd3 for caching; lazy, nothing is stored yet
rdd3.count()  // recomputes the lineage and populates the cache
rdd3.count()  // served from memory, noticeably faster
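The gap between the second and third count is the point of the demo: once the partitions are in memory, the job skips the HDFS read entirely. When the cached data is no longer needed, rdd3.unpersist() releases the memory (standard RDD API).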
bin/spark-shell --master spark://hadoop1:7077 --executor-memory 3g
bin/spark-shell --master yarn-client --executor-memory 3g --num-executors 3
./bin/spark-submit --master yarn-cluster --class week2.SougouQA --executor-memory 3g week2.jar hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt hdfs://hadoop1:8000/dataguru/week2/output3
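In yarn-client mode the driver runs in the local process and only the executors run in YARN containers, which suits interactive use like spark-shell; in yarn-cluster mode the driver itself runs inside the YARN ApplicationMaster, which suits batch submission like this last command, where the HDFS input and output paths are passed as the two application arguments.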