Spark installation

[root@localhost soft]# ls
jdk1.7.0_79                spark-1.1.0-bin-hadoop1
jdk-7u79-linux-x64.tar.gz  spark-1.1.0-bin-hadoop1.tgz
[root@localhost soft]# cd jdk1.7.0_79
[root@localhost jdk1.7.0_79]# ls
bin        lib          src.zip
COPYRIGHT  LICENSE      THIRDPARTYLICENSEREADME-JAVAFX.txt
db         man          THIRDPARTYLICENSEREADME.txt
include    README.html
jre        release
[root@localhost jdk1.7.0_79]# pwd
/root/soft/jdk1.7.0_79
[root@localhost jdk1.7.0_79]# vi /etc/pro
profile    profile.d/ protocols  
[root@localhost jdk1.7.0_79]# vi /etc/profile
[root@localhost jdk1.7.0_79]# source /etc/profile
[root@localhost jdk1.7.0_79]# java -version
java version "1.7.0_79"
Java(TM) SE Runtime Environment (build 1.7.0_79-b15)
Java HotSpot(TM) 64-Bit Server VM (build 24.79-b02, mixed mode)
[root@localhost jdk1.7.0_79]# pwd
/root/soft/jdk1.7.0_79
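
The actual edit to /etc/profile is not shown in the session; a minimal sketch of the lines that would be appended, assuming the JDK path printed by pwd above (/root/soft/jdk1.7.0_79):

export JAVA_HOME=/root/soft/jdk1.7.0_79
export PATH=$JAVA_HOME/bin:$PATH

After source /etc/profile, the java -version check above picks up this JDK.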
[root@localhost jdk1.7.0_79]# cd ..
[root@localhost soft]# ls
jdk1.7.0_79                spark-1.1.0-bin-hadoop1
jdk-7u79-linux-x64.tar.gz  spark-1.1.0-bin-hadoop1.tgz
[root@localhost soft]# cd spark-1.1.0-bin-hadoop1
[root@localhost spark-1.1.0-bin-hadoop1]# ls
bin          conf  examples  LICENSE  python     RELEASE
CHANGES.txt  ec2   lib       NOTICE   README.md  sbin
[root@localhost spark-1.1.0-bin-hadoop1]# cd conf
[root@localhost conf]# ls
fairscheduler.xml.template   slaves
log4j.properties.template    spark-defaults.conf.template
metrics.properties.template  spark-env.sh.template
[root@localhost conf]# vi spark-env.sh.template
[root@localhost conf]# cd ..
[root@localhost spark-1.1.0-bin-hadoop1]# ./bin/spark-shell    (or, against a standalone master: ./bin/spark-shell --master spark://localhost:7077 --executor-memory 3g)
arkDriver@192.168.88.152:58247/user/HeartbeatReceiver
16/04/30 22:57:10 INFO SparkILoop: Created spark context..
Spark context available as sc.

scala>
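
The change made to spark-env.sh.template is not shown. Spark only reads conf/spark-env.sh, so the template is normally copied first; a minimal sketch for the standalone master at spark://localhost:7077 used above (the worker memory value is an assumption chosen to match --executor-memory 3g):

cp conf/spark-env.sh.template conf/spark-env.sh

# conf/spark-env.sh
export JAVA_HOME=/root/soft/jdk1.7.0_79
export SPARK_MASTER_IP=localhost
export SPARK_WORKER_MEMORY=3g

With these settings, the standalone master and worker are started with sbin/start-all.sh before passing --master spark://localhost:7077; plain ./bin/spark-shell without --master runs in local mode.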

IDE installation (IntelliJ IDEA)

https://d1opms6zj7jotq.cloudfront.net/idea/ideaIU-2016.1.1.tar.gz

[root@localhost soft]# tar -zxf ideaIU-2016.1.1.tar.gz

[root@localhost soft]# cd idea-IU-145.597.3

[root@localhost idea-IU-145.597.3]# ./bin/idea.sh

package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

// An object (not a class) is required so that spark-submit can find a static main method.
object SougouQA {
  def main(args: Array[String]) {
    // Input and output paths are hardcoded below, so no command-line arguments are needed
    // (the spark-submit command later in these notes passes none).
    val conf = new SparkConf().setAppName("SougouQA")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("file:///root/soft/SogouQ1.txt")
    // Split on tabs and keep only well-formed records with exactly 6 fields.
    val rdd2 = rdd1.map(_.split("\t")).filter(_.length == 6)
    println(rdd2.count())
    // Records whose URL ranked 1st in the results but was only the user's 2nd click.
    val rdd3 = rdd2.filter(_(3).toInt == 1).filter(_(4).toInt == 2)
    println(rdd3.count())
    println(rdd3.toDebugString)
    // Number of records per user ID, sorted by count in descending order.
    val rdd4 = rdd2.map(x => (x(1), 1)).reduceByKey(_ + _).map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1))
    println(rdd4.toDebugString)
    rdd4.saveAsTextFile("file:///root/soft/output")
    sc.stop()
  }
}

package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

// Case classes at the top level of the file (the usual placement for records used inside RDD closures).
case class Register(d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
case class Click(d: java.util.Date, uuid: String, landing_page: Int)

object Join {
  def main(args: Array[String]) {
    // Input paths are hardcoded below, so no command-line arguments are needed.
    val conf = new SparkConf().setAppName("Join")
    val sc = new SparkContext(conf)
    val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
    // Registration records keyed by uuid: date \t uuid \t cust_id \t lat \t lng
//    val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
    val reg = sc.textFile("file:///root/soft/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
    // Click records keyed by uuid: date \t uuid \t landing_page
//    val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
    val clk = sc.textFile("file:///root/soft/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
    // Join the two pair RDDs on uuid and print the first two results.
    reg.join(clk).take(2).foreach(println)

    sc.stop()
  }
}
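
The notes don't show a submit command for Join; assuming it is packaged into the same week2.jar, the invocation would mirror the SougouQA one below:

bin/spark-submit --master spark://localhost:7077 --class week2.Join /root/soft/week2.jar

From the parsing code, reg.tsv is expected to have five tab-separated fields (date, uuid, cust_id, lat, lng) and clk.tsv three (date, uuid, landing_page).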



[root@localhost spark-1.1.0-bin-hadoop1]# bin/spark-submit --master spark://localhost:7077 --class week2.SougouQA /root/soft/week2.jar
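
How week2.jar was built is not shown (presumably exported from IDEA); an alternative sketch using sbt, assuming Scala 2.10 and Spark 1.1.0 to match the spark-1.1.0-bin-hadoop1 distribution:

// build.sbt
name := "week2"

version := "1.0"

scalaVersion := "2.10.4"

// "provided": the Spark jars are already on the classpath when launched via spark-submit
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" % "provided"

Running sbt package then produces target/scala-2.10/week2_2.10-1.0.jar, which can be copied to /root/soft/week2.jar for the command above.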

hdfs dfs -getmerge file:///root/soft/output result    (merge the part-* files in the Spark output directory into a single local file named result)


// Log-processing demo
// http://download.labs.sogou.com/dl/q.html  full version (2 GB), gz format
// Record format: access time \t user ID \t [query] \t rank of the URL in the results \t user's click order \t URL clicked by the user
// SogouQ1.txt, SogouQ2.txt and SogouQ3.txt were each cut from the full SogouQ log with head -n or tail -n

// How many records ranked 1st in the search results but were only the user's 2nd click?
// Check the HDFS blocks with: bin/hdfs fsck /dataguru/data/SogouQ1.txt -files -blocks -locations
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt", 9)   // request at least 9 partitions
val rdd2 = rdd1.map(_.split("\t")).filter(_.length == 6)                     // keep well-formed 6-field records
val rdd3 = rdd2.filter(_(3).toInt == 1).filter(_(4).toInt == 2)              // rank 1, click order 2
rdd3.count()   // first count: computed from HDFS
rdd3.cache()   // mark rdd3 to be kept in memory
rdd3.count()   // recomputed once more; the partitions are cached as a side effect
rdd3.count()   // served from the cache, so noticeably faster


bin/spark-shell --master spark://hadoop1:7077 --executor-memory 3g

bin/spark-shell --master yarn-client --executor-memory 3g --num-executors 3

./bin/spark-submit --master yarn-cluster --class week2.SogouQA --executor-memory 3g week2.jar hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt  hdfs://hadoop1:8000/dataguru/week2/output3
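
This last submission passes input and output paths as arguments (and spells the class SogouQA), whereas the SougouQA object above hardcodes local file paths. A minimal sketch of a variant that takes both paths from args and reuses the same logic (an assumption about how the cluster run was wired up):

package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

object SogouQA {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: SogouQA <input> <output>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("SogouQA")
    val sc = new SparkContext(conf)
    // args(0): input log, e.g. hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt
    // args(1): output directory, e.g. hdfs://hadoop1:8000/dataguru/week2/output3
    val records = sc.textFile(args(0)).map(_.split("\t")).filter(_.length == 6)
    // Records per user ID, sorted by count in descending order.
    val counts = records.map(x => (x(1), 1)).reduceByKey(_ + _)
      .map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1))
    counts.saveAsTextFile(args(1))
    sc.stop()
  }
}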


Reposted from: https://my.oschina.net/goudingcheng/blog/668570
