D25 Scala基础_scala为什么不能这样for(i<- 1 to 3;for(j<-1 to 3; if i!=j)-CSDN博客

本文链接：https://blog.csdn.net/u014253445/article/details/77600667

spark的start-all.sh启动命令与hadoop的start-all.sh冲突，现在重命名。勿忘！

①spark来进行wordcount

// Word count: split each line on single spaces, pair every word with 1, sum the 1s per word, then collect the result.
sc.textFile("/home/hadoop/words.txt").flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b).collect()

②spark来进行wordcount后排序

// Word count as above, then sort the (word, count) pairs by count in descending order before collecting.
sc.textFile("/home/hadoop/words.txt").flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b).sortBy(pair => pair._2, ascending = false).collect()

③spark来进行wordcount后排序并保存。输出路径必须是一个尚不存在的目录。

// Word count sorted by descending count, written out as text files under /home/hadoop/out.
sc.textFile("/home/hadoop/words.txt").flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b).sortBy(pair => pair._2, ascending = false).saveAsTextFile("/home/hadoop/out")

在 Open Perspective 中将编辑和运行模式设定为 Scala，这样窗口就是 Scala 的编程窗口。

简单的Scala编程基础

   
   package test
/** Walk-through of basic Scala syntax; every step prints its result to standard output. */
object outPut {

  /**
   * Runs the syntax demos in order.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // (1) Printing a line.
    println("hehe")

    // (2) A `val` cannot be reassigned, comparable to a `final` variable in Java.
    val i = 1

    // (3) A `var` can be reassigned; Scala encourages `val` wherever possible.
    var s = "hello"

    // The compiler infers types; when a type is given explicitly it follows the name.
    val str: String = "itcast"

    // (4) for loop over a range. The loop variable has its own name so that it does
    // not shadow the outer `i`, which the expressions below still read.
    for (n <- 1 to 10)
      println(n)

    // (5) The two branches of an `if` expression may have different types.
    val z = if (i > 1) 1 else "error"
    // Print the value of z.
    println(z)

    // (6) A block is an expression: its value is the value of the branch that runs.
    val result = {
      if (i < 0) {
        -1
      } else if (i >= 1) {
        1
      } else {
        "error"
      }
    }

    // (7) for loop over an array of strings.
    val arr = Array("a", "b", "c")
    for (elem <- arr)
      println(elem)

    // (8) Advanced for loop: each generator can carry a guard.
    // Note that no semicolon is written before `if`.
    for (a <- 1 to 3; b <- 1 to 3 if a != b)
      print((10 * a + b) + " ")

    // (9) `yield` builds a new collection from the loop.
    val x = for (n <- 1 to 10) yield n * 2
    println(x)
  }
}

方法与函数区别：

函数是一个值，粒度更小，可以作为参数传入到方法内；方法是定义在类或对象中的成员，层次更高些。

   
    
 // An ordinary method: takes two Ints and returns their sum.
 def m2(x: Int, y: Int): Int = {
   x + y
 }
…………………………………………………………………………
// A method whose parameter `f` must be a function taking two Ints and returning an Int.
// m1 applies `f` to the fixed arguments 2 and 6 and returns the result.
def m1(f: (Int, Int) => Int): Int = f(2, 6)
// f1: a function value that takes two Ints and returns their sum.
val f1: (Int, Int) => Int = _ + _

// f2: a second function value, returning the product of its two Int arguments.
val f2: (Int, Int) => Int = _ * _
 /**
  * Passes the function values f1 and f2 to the method m1 and prints each result.
  *
  * @param args command-line arguments; not used
  */
 def main(args: Array[String]): Unit = {
   // Call m1 with the function f1.
   val r1 = m1(f1)
   println(r1)
   // Call m1 with the function f2.
   val r2 = m1(f2)
   println(r2)
 }

定义方法：

定义函数：

关于数组的操作

   
   // ArrayBuffer is not in scope by default, so the snippet imports it itself.
   import scala.collection.mutable.ArrayBuffer

   // Start with an empty, growable buffer of Ints.
   val arr1 = ArrayBuffer[Int]()
   println(arr1)
   // Append a whole array with ++=.
   arr1 ++= Array(2, 3)
   println(arr1)
   // Append several elements at once. The varargs form `arr1 += (2, 3)` is deprecated
   // since Scala 2.13, so the elements are passed to ++= as a collection instead.
   arr1 ++= Seq(2, 3)
   println(arr1)
   // Append another buffer.
   arr1 ++= ArrayBuffer(2, 3)
   println(arr1)
   // Insert element 1 at index 0.
   arr1.insert(0, 1)
   println(arr1)
   // Remove the element at index 0.
   arr1.remove(0)
   println(arr1)

    
    // ListBuffer is not in scope by default, so the snippet imports it itself.
    import scala.collection.mutable.ListBuffer

    // Map: built from (key, value) pairs; `get` returns an Option.
    val myMap = Map(("hehe", 1), ("nima", 2), ("shabi", 3))
    println(myMap.get("hehe"))
    // Tuple: `t` names the whole tuple while a, b, c, d name its components.
    // The `@` binder evaluates the right-hand side once.
    val t @ (a, b, c, d) = ("hadoop", "spark", 3.14, 19941101)
    println(t._1)
    println(t._2)
    println(t._3)
    println(t._4)
    // zip pairs up the elements of two arrays position by position.
    val source = Array(20, 30, 40, 50)
    val name = Array("ni", "wo", "ta", "JJ")
    println(name.zip(source).toBuffer)
    // A mutable list that initially holds the three elements 1, 2, 3.
    val lst0 = ListBuffer[Int](1, 2, 3)
    // An empty mutable list.
    val lst1 = new ListBuffer[Int]
    // Append elements to lst1 in place; no new collection is created.
    lst1 += 4
    lst1.append(5)
    // Append the elements of lst1 to lst0 in place; no new collection is created.
    lst0 ++= lst1
    // Concatenate lst0 and lst1 into a new ListBuffer; lst0 and lst1 are unchanged.
    val lst2 = lst0 ++ lst1
    // Build a new collection with 5 appended after the elements of lst0.
    val lst3 = lst0 :+ 5

关于对象

    
    /**
     * A person with a name and an age.
     *
     * The class body is the primary constructor, so creating an instance prints a message.
     *
     * @param name the person's name
     * @param age  the person's age
     */
    class Person(val name: String, val age: Int) {
      println("执行主构造器")

      // Defaults to "male"; only the auxiliary constructor assigns it, and nothing in
      // this class reads it.
      private var gender = "male"

      /**
       * Auxiliary constructor, defined with the `this` keyword, that also sets the gender.
       *
       * @param name   the person's name
       * @param age    the person's age
       * @param gender value stored in the private `gender` field
       */
      def this(name: String, age: Int, gender: String) = {
        // Every auxiliary constructor must begin by calling the primary constructor
        // or another auxiliary constructor.
        this(name, age)
        println("执行辅助构造器")
        this.gender = gender
      }

      /** Returns "<name> is <age> years old " (with the trailing space). */
      def description(): String = name + " is " + age + " years old "
    }
 /**
  * Creates a Person through the primary constructor and prints its description.
  *
  * @param args command-line arguments; not used
  */
 def main(args: Array[String]): Unit = {
   val p1 = new Person("曾祥雨", 22)
   println(p1.description())
 }

单机版WordCount

   
   /** Single-machine word count over an in-memory list of lines, printing every intermediate step. */
   object outPut {

     /**
      * Runs the word count step by step and prints each intermediate value.
      * The printed order of Map entries depends on the Map implementation and may vary.
      *
      * @param args command-line arguments; not used
      */
     def main(args: Array[String]): Unit = {
       val words = List("hello tom hello jerry", "hello tom kitty hello hello")
       println(words)
       // Split every line on single spaces and flatten the pieces into one list of words.
       val lines = words.flatMap(_.split(" "))
       println(lines) // List(hello, tom, hello, jerry, hello, tom, kitty, hello, hello)
       // Pair every word with the count 1.
       val word = lines.map((_, 1))
       println(word) // List((hello,1), (tom,1), (hello,1), (jerry,1), ...)
       // Group the pairs by word: word -> list of that word's (word, 1) pairs.
       val group = word.groupBy(_._1)
       println(group)
       // Just the distinct words (the keys of the grouping).
       val num = group.map(_._1)
       println(num)
       // The count of a word is the number of pairs in its group.
       val res = group.map(t => (t._1, t._2.size))
       println(res) // tom -> 2, kitty -> 1, jerry -> 1, hello -> 5
       // sortBy sorts in ascending order by default.
       val finalRes = res.toList.sortBy(_._2)
       println(finalRes) // the two count-1 words first, then (tom,2), (hello,5)
     }
   }

    
    以上等价于下面几行代码

 val lines = List("hello tom hello jerry", "hello jerry", "hello kitty")
 println(lines)
 // Split into words, pair each word with 1 and group the pairs by word.
 // Both results below are derived from this one grouping.
 val grouped = lines.flatMap(_.split(" ")).map((_, 1)).groupBy(_._1)
 // Sum the 1s of every group. A strict `map` is used instead of `mapValues`, which is
 // deprecated in Scala 2.13 and returns a lazy view there, so printing it would not
 // show the counts.
 val res = grouped.map { case (word, pairs) => word -> pairs.foldLeft(0)(_ + _._2) }
 println(res)
 // Same counts via the group sizes, as a list sorted by count in descending order.
 val finalRes = grouped.map(t => (t._1, t._2.size)).toList.sortBy(_._2).reverse

 println(finalRes)

常用的转换操作：

   
   最常用的转换操作有两个：map和filter，map(func)是将func应用到所有元素，得到一个新的RDD。
filter是将func返回为true的元素过滤出来，组成一个新的RDD。
一些比较常用的转换如下：

map(func)       返回一个新的分布式数据集，将数据源的每一个元素传递给函数 func 映射组成。
filter(func)    返回一个新的数据集，由数据源中使函数 func 返回 true 的元素组成。
flatMap(func)  类似于 map，但是每个输入项能被映射成多个输出项(所以 func 必须返回一个Seq，而不是单个 item)。
union(otherDataset)    两个RDD求并集
intersection(otherDataset)   两个RDD求交集
groupByKey()  作用于(K,V)的数据集，依据K对值进行归并，返回一个(K, Iterable[V])的数据集
reduceByKey(func)作用于(K,V)的数据集，依据K对值使用func进行归约，返回一个(K,V)数据集
sortByKey([ascending])   返回一个依据K进行排序的数据集
最常用的动作就是reduce，将数据集归约为一个结果。
一些比较常用的动作如下：
 
reduce(func)  按照func函数对数据集进行归约，func接受两个参数，返回一个结果，须满足结合律和交换律，以便于分布式计算。
count()返回数据集的元素个数
first()返回第一个元素
take(n)  以数组形式返回集合的前n个元素
saveAsTextFile(path)将数据集保存为文本文件

    
    // Read the HDFS file as a collection of text lines.
    val textFile = sc.textFile("hdfs://...")
    // Transformations: split every line into words, pair each word with 1,
    // then reduce by key so that the 1s of each word are summed.
    val counts = textFile
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey((a, b) => a + b)
    // Save the result.
    counts.saveAsTextFile("hdfs://...")