1. Scala code (using type inference)
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Word count implemented in Scala.
 */
object WordcountScala {
  def main(args: Array[String]): Unit = {
    // Create the configuration object
    val conf = new SparkConf()
    conf.setAppName("word count")
    conf.setMaster("local")
    // Create the Spark context
    val sc = new SparkContext(conf)
    // Load the file and split each line into words
    val rdd1 = sc.textFile("d:\\java\\1.txt")
    val rdd2 = rdd1.flatMap(line => line.split(" "))
    // Pair each word with the count 1
    val rdd3 = rdd2.map(e => (e, 1))
    // Sum the counts per word
    val rdd4 = rdd3.reduceByKey(_ + _)
    // Collect to the driver and print
    val arr = rdd4.collect()
    arr.foreach(println)
  }
}
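Since each intermediate RDD above is used only once, the same pipeline can also be chained into a single expression. A minimal sketch, assuming the same input path as the listing:

sc.textFile("d:\\java\\1.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .collect()
  .foreach(println)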
2. Scala code (with explicit types)
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Word count implemented in Scala, with explicit types.
 */
object WordcountScala2 {
  def main(args: Array[String]): Unit = {
    // Create the configuration object
    val conf: SparkConf = new SparkConf()
    conf.setAppName("word count")
    conf.setMaster("local")
    // Create the Spark context
    val sc: SparkContext = new SparkContext(conf)
    // 1. Load the file
    val rdd1: RDD[String] = sc.textFile("d:\\java\\1.txt")
    // 2. Flatten each line into words
    val rdd2: RDD[String] = rdd1.flatMap((x: String) => x.split(" "))
    // 3. Pair each word with the count 1
    val rdd3: RDD[(String, Int)] = rdd2.map(e => (e, 1))
    // 4. Reduce by key: sum the counts per word
    val rdd4: RDD[(String, Int)] = rdd3.reduceByKey((a: Int, b: Int) => a + b)
    // Collect the RDD into an array on the driver
    val arr: Array[(String, Int)] = rdd4.collect()
    // Iterate over the array
    for (t <- arr) {
      println(t._1 + " : " + t._2)
    }
  }
}
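To compile and run either object, a minimal sbt build is enough. A sketch, assuming Scala 2.12 and Spark 3.x (the exact versions are assumptions, not taken from the original code):

// build.sbt (versions are assumptions)
name := "wordcount"
scalaVersion := "2.12.18"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.1"

With setMaster("local") hard-coded as above, the program runs entirely inside the local JVM, so `sbt run` is sufficient; no cluster or spark-submit is required.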