Basic framework
package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._  // implicit conversions for pair-RDD operations

object WordCount1 {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      System.err.println("Usage: WordCount1 <file1>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("WordCount1")
    val sc = new SparkContext(conf)
    // job logic goes here
    sc.stop()
  }
}
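To make the skeleton do something, the word-count pipeline from the shell session below can be dropped in between creating the context and sc.stop(). A minimal sketch, assuming args(0) names a readable text file or directory:

// sketch only: args(0) is assumed to be the input path
val counts = sc.textFile(args(0))
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
counts.take(10).foreach(println)  // print a sample rather than collecting everything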
RDD methods
// Build an RDD from a local collection and chain transformations (lazy until an action runs)
val num = sc.parallelize(1 to 10)
val doublenum = num.map(_ * 2)
val threenum = doublenum.filter(_ % 3 == 0)
threenum.collect        // action: triggers the computation
threenum.toDebugString  // print the RDD lineage
// Same pipeline with an explicit partition count of 6
val num1 = sc.parallelize(1 to 10, 6)
val doublenum1 = num1.map(_ * 2)
val threenum1 = doublenum1.filter(_ % 3 == 0)
threenum1.collect
threenum1.toDebugString
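The second argument to parallelize sets the partition count. A quick way to verify it (the first value depends on the cluster's default parallelism):

num.partitions.size   // cluster default
num1.partitions.size  // 6, as requested above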
// cache() marks an RDD for in-memory reuse; it is materialized by the next action
threenum.cache()
val fournum = threenum.map(x => x * x)
fournum.collect
fournum.toDebugString
threenum.unpersist()  // release the cached partitions
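cache() is shorthand for persisting at the default MEMORY_ONLY level; other levels can be requested explicitly. A small sketch using the standard StorageLevel constants:

import org.apache.spark.storage.StorageLevel
threenum.persist(StorageLevel.MEMORY_AND_DISK)  // spill to disk when memory is tight
threenum.count()                                // an action materializes the cached data
threenum.unpersist()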
// Common actions
num.reduce(_ + _)
num.take(5)
num.first
num.count
num.take(5).foreach(println)
// Key-value RDDs
val kv1 = sc.parallelize(List(("A", 1), ("B", 2), ("C", 3), ("A", 4), ("B", 5)))
kv1.sortByKey().collect
kv1.groupByKey().collect
kv1.reduceByKey(_ + _).collect
val kv2 = sc.parallelize(List(("A", 4), ("A", 4), ("C", 3), ("A", 4), ("B", 5)))
kv2.distinct.collect
kv1.union(kv2).collect
val kv3 = sc.parallelize(List(("A", 10), ("B", 20), ("D", 30)))
kv1.join(kv3).collect
kv1.cogroup(kv3).collect
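For reference, join keeps only keys present on both sides and pairs up every matching value, while cogroup emits one row per key with the value lists from each side, including keys that exist on only one side. Expected results, up to ordering:

// kv1.join(kv3):    Array((A,(1,10)), (A,(4,10)), (B,(2,20)), (B,(5,20)))
// kv1.cogroup(kv3): one row per key; e.g. C -> ((3), ()) and D -> ((), (30))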
val kv4 = sc.parallelize(List(List(1, 2), List(3, 4)))
kv4.flatMap(x => x.map(_ + 1)).collect
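Compare with map, which keeps the nesting; flatMap flattens the inner lists into a single RDD:

kv4.map(x => x.map(_ + 1)).collect      // Array(List(2, 3), List(4, 5))
kv4.flatMap(x => x.map(_ + 1)).collect  // Array(2, 3, 4, 5)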
// Read every file under an HDFS directory and run a word count
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/")
rdd1.toDebugString
val words = rdd1.flatMap(_.split(" "))
val wordscount = words.map(x => (x, 1)).reduceByKey(_ + _)
wordscount.collect
wordscount.toDebugString
// textFile also accepts glob patterns
val rdd2 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/*.txt")
rdd2.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).collect
// Compressed input is decompressed transparently
val rdd3 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/test.txt.gz")
rdd3.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).collect
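Gzip is not a splittable format, so the whole .gz file ends up in one partition; repartition can restore parallelism before any expensive downstream work. A quick check (sketch):

rdd3.partitions.size             // 1 for a single .gz file
val rdd3a = rdd3.repartition(4)  // redistribute across 4 partitions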
// Query-log analysis over the SogouQ dataset (tab-separated, 6 fields per well-formed row)
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt")
val rdd2 = rdd1.map(_.split("\t")).filter(_.length == 6)  // drop malformed rows
rdd2.count()
val rdd3 = rdd2.filter(_(3).toInt == 1).filter(_(4).toInt == 2)  // keep rows whose 4th field is 1 and 5th field is 2
rdd3.count()
rdd3.toDebugString
// Rank users by query count: swap to (count, user), sort by key descending, swap back
val rdd4 = rdd2.map(x => (x(1), 1)).reduceByKey(_ + _).map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1))
rdd4.toDebugString
rdd4.saveAsTextFile("hdfs://hadoop1:8000/dataguru/week2/output1")
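The swap-sort-swap pattern above works on any Spark version; newer releases also provide sortBy on plain RDDs, which expresses the same ranking directly (a sketch, assuming sortBy is available in your Spark version):

val rdd4b = rdd2.map(x => (x(1), 1)).reduceByKey(_ + _).sortBy(_._2, false)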
// Caching demonstration: the first count() reads from HDFS, the second is served from memory
val rdd5 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ3.txt")
rdd5.cache()
rdd5.count()  // triggers the read and populates the cache
rdd5.count()  // noticeably faster, served from the cached partitions
// Join two TSV datasets keyed by uuid
val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
case class Register(d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
case class Click(d: java.util.Date, uuid: String, landing_page: Int)
val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
reg.join(clk).take(2)
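The join yields records of type (uuid, (Register, Click)), so a pattern match gives convenient access to both sides. A sketch that counts joined records per landing page:

reg.join(clk)
  .map { case (_, (_, c)) => (c.landing_page, 1) }
  .reduceByKey(_ + _)
  .collect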