// Compute total sales per day (DataFrame DSL style).
// Requirement: the data source has records of (date, sale amount, user ID).
// Some records in the source are incomplete; filter those out
// before aggregating daily sales totals.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.sum
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Exercise: compute the total sales amount per day using the DataFrame DSL.
 *
 * Input records have the shape "date,amount,userId"; rows missing any of the
 * three fields are dropped before aggregation.
 */
object _01_Exercise {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local").appName("Exercise").getOrCreate()
    import spark.implicits._
    // Data source: (date, sale amount, user ID) — some rows are incomplete.
    val array: Array[String] = Array(
      "2020-10-01,55,1122",
      "2020-10-02,54,1134",
      "2020-10-03,53",
      "2020-10-01,56,1133",
      "2020-10-02,60",
      "2020-10-02,55,1133",
      "2020-10-04,62",
      "2020-10-03,70,1134",
      "2020-10-04,53,1133",
      "2020-10-06,56",
      "2020-10-03,57,1133"
    )
    // 1. Build an RDD from the in-memory data.
    val rdd: RDD[String] = spark.sparkContext.parallelize(array)
    // 2. Keep only complete rows (all three fields present) and map to Log.
    val df: DataFrame = rdd.filter(_.split(",").length == 3)
      .map(line => {
        val parts: Array[String] = line.split(",")
        Log(parts(0), parts(1).toLong)
      })
      .toDF()
    // 3. Total sales per day.
    // BUG FIX: `.as("sum_sal")` must be applied to the aggregate COLUMN inside
    // `agg`, not to the resulting DataFrame — `DataFrame.as(...)` only sets a
    // table alias, so the output column was still named "sum(sal)".
    // Also use `$"sal"` instead of the deprecated Symbol syntax `'sal`.
    df.groupBy("time")
      .agg(sum($"sal").as("sum_sal"))
      .show()
    // Release cluster resources before the JVM exits.
    spark.stop()
  }
}
case class Log(time: String, sal: Long)