Today's task, done in Scala: for each row of a DataFrame, sum the values in the backward-looking window of `windows` rows above it. Final code first, then the debugging that led to it:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object test {
  // For each row i, return the sum of the `windows` rows above it;
  // rows with fewer than `windows` predecessors (i <= windows) get 0.0.
  def rollingDataFrame(ss: SparkSession, dataFrame: DataFrame, windows: Int): List[Double] = {
    val len = dataFrame.count().toInt
    val listValue = dataFrame
      .select("int_column")
      .rdd
      .collect()
      .map(_(0))             // Array[Any]: Row(0) is untyped
      .toList
      .map(_.toString.toInt) // convert to Int so reduceLeft(_ + _) type-checks
    val result =
      (for (i <- 1 to len)
        yield
          if (i <= windows) 0.00
          else listValue.slice(i - 1 - windows, i - 1).reduceLeft(_ + _).toDouble).toList
    result
  }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()
    import spark.implicits._
    val df = Seq(
      (1, "First Value", java.sql.Date.valueOf("2010-01-01")),
      (2, "Second Value", java.sql.Date.valueOf("2010-02-01")),
      (3, "First Value", java.sql.Date.valueOf("2010-01-02")),
      (4, "Second Value", java.sql.Date.valueOf("2010-02-02")),
      (5, "First Value", java.sql.Date.valueOf("2010-01-03")),
      (6, "Second Value", java.sql.Date.valueOf("2010-02-03"))
    ).toDF("int_column", "string_column", "date_column")
    df.show()
    val result = rollingDataFrame(spark, df, 2)
    println(result)
    // Attach a generated id to both sides so the rolling sums can be
    // joined back onto the original rows (fine here: local mode, one partition).
    val result_addID = result.toDF("rollingData").withColumn("id", monotonically_increasing_id())
    val df_addID = df.withColumn("id", monotonically_increasing_id())
    val results_DF = df_addID.join(result_addID, "id")
    result_addID.show()
    df_addID.show()
    results_DF.show()
  }
}
The output:
+---+----------+-------------+-----------+-----------+
| id|int_column|string_column|date_column|rollingData|
+---+----------+-------------+-----------+-----------+
|  0|         1|  First Value| 2010-01-01|        0.0|
|  1|         2| Second Value| 2010-02-01|        0.0|
|  2|         3|  First Value| 2010-01-02|        3.0|
|  3|         4| Second Value| 2010-02-02|        5.0|
|  4|         5|  First Value| 2010-01-03|        7.0|
|  5|         6| Second Value| 2010-02-03|        9.0|
+---+----------+-------------+-----------+-----------+
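As an aside: the same backward-looking sum can also be computed entirely inside Spark with a window function, avoiding collect() on the driver. A minimal sketch under the column names above (the unpartitioned window pulls all data to one partition, which is fine for this local example; ordering by the generated id follows the same trick already used for the join):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val windows = 2
val df_id = df.withColumn("id", monotonically_increasing_id())
val order = Window.orderBy("id")
val frame = order.rowsBetween(-windows, -1) // the `windows` rows above the current row
val rolled = df_id.withColumn("rollingData",
  when(row_number().over(order) <= windows, lit(0.0)) // first `windows` rows -> 0.0, as in rollingDataFrame
    .otherwise(sum(col("int_column")).over(frame).cast("double")))
rolled.show()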
I first sanity-checked the slice logic in the Scala REPL:
scala> values
res17: List[Int] = List(1, 2, 3, 4, 5, 6, 7)
scala> values.slice(7-4, 7).reduceLeft(_+_)
res16: Int = 22
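To mirror the exact indexing that rollingDataFrame uses, slice(i-1-windows, i-1), here is an illustrative continuation of the same session with i = 3 and windows = 2, which sums the two rows above row 3:

scala> values.slice(3-1-2, 3-1).reduceLeft(_+_)   // List(1, 2)
res18: Int = 3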
No problem there, right? But building a DataFrame in the IDE and extracting one column hit a snag. The code:
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.JavaConverters._

object test {
  // def rollingDataFrame(ss: SparkSession, dataFrame: DataFrame, windows: Int): DataFrame = {
  //   val len = dataFrame.count().toInt
  //   val listValue = dataFrame
  //     .select("int_column")
  //     .collect()
  //     .map(_(0))
  //     .toList
  // }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()
    import spark.implicits._
    val df = Seq(
      (1, "First Value", java.sql.Date.valueOf("2010-01-01")),
      (2, "Second Value", java.sql.Date.valueOf("2010-02-01")),
      (3, "First Value", java.sql.Date.valueOf("2010-01-02")),
      (4, "Second Value", java.sql.Date.valueOf("2010-02-02")),
      (5, "First Value", java.sql.Date.valueOf("2010-01-03")),
      (6, "Second Value", java.sql.Date.valueOf("2010-02-03"))
    ).toDF("int_column", "string_column", "date_column")
    df.show()
    // val rollingDataFrame = rollingDataFrame(spark, df, 2)
    var listValue = df
      .select("int_column")
      .rdd
      .collect()
      .map(_(0)) // returns Array[Any]
    // val listValue = df.select("int_column").as("String").collect().toList
    println("listValue:{}:")
    print(listValue)
    var fillListValue: List[Int] = Nil // scratch vars from debugging
    val len = df.count().toInt
    val windows = 2
    val i = 4
    var lv = List(3, 4)
    val ll = listValue.slice(i - windows, i).toList.reduceLeft(_ + _) // does not compile: elements are Any
    print(ll)
  }
}
It fails to compile with:
Error:(51, 80) type mismatch;
found : Any
required: String
val ll:List[Int] = listValue.slice(i-windows, i).toList.reduceLeft(_ + _) //.map(_.toString.toInt)
Adding .map(_.toString.toInt) fixes it. The reason: df.select("int_column").rdd.collect().map(_(0)) returns Array[Any], and Any has no + method, so reduceLeft(_ + _) (and likewise fold, foldLeft, foldRight) cannot type-check. The compiler's only candidate for + on Any is Predef's string-concatenation implicit, which expects a String argument, hence the "found: Any, required: String" message. The elements have to be converted to a numeric type first.
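The failure is easy to reproduce outside Spark with a plain List[Any]; a minimal sketch:

val xs: List[Any] = List(1, 2, 3)
// xs.reduceLeft(_ + _)  // does not compile: Any has no +, and the
//                       // any2stringadd fallback expects a String
val ok = xs.map(_.toString.toInt).reduceLeft(_ + _) // 6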
With that conversion applied, it works:
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.JavaConverters._

object test {
  // def rollingDataFrame(ss: SparkSession, dataFrame: DataFrame, windows: Int): DataFrame = {
  //   val len = dataFrame.count().toInt
  //   val listValue = dataFrame
  //     .select("int_column")
  //     .collect()
  //     .map(_(0))
  //     .toList
  // }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()
    import spark.implicits._
    val df = Seq(
      (1, "First Value", java.sql.Date.valueOf("2010-01-01")),
      (2, "Second Value", java.sql.Date.valueOf("2010-02-01")),
      (3, "First Value", java.sql.Date.valueOf("2010-01-02")),
      (4, "Second Value", java.sql.Date.valueOf("2010-02-02")),
      (5, "First Value", java.sql.Date.valueOf("2010-01-03")),
      (6, "Second Value", java.sql.Date.valueOf("2010-02-03"))
    ).toDF("int_column", "string_column", "date_column")
    df.show()
    // val rollingDataFrame = rollingDataFrame(spark, df, 2)
    var listValue = df
      .select("int_column")
      .rdd
      .collect()
      .map(_(0)) // returns Array[Any]
    // val listValue = df.select("int_column").as("String").collect().toList
    println("listValue:{}:")
    print(listValue.toList)
    var fillListValue: List[Int] = Nil // scratch vars from debugging
    val len = df.count().toInt
    val windows = 2
    val i = 4
    var lv = List(3, 4)
    val ll: List[Int] = listValue.slice(i - windows, i).toList.map(_.toString.toInt) // convert Any -> Int first
    val result = ll.reduceLeft(_ + _) // now reduceLeft type-checks
    print(result)
  }
}
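An alternative sketch that avoids Any altogether: read the column as a typed Dataset[Int], so collect() already returns Array[Int] (the Int encoder comes from import spark.implicits._, which is already in scope):

val typedValues: Array[Int] = df.select("int_column").as[Int].collect()
val windowSum = typedValues.slice(i - windows, i).sum // same result, no toString/toInt round-trip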