// 知识点:
// 1. 创建新 Row
// 2. Row 新增字段
// 3. 广播
// 出现的问题:
// 1. 广播流后,使用流时出现空指针异常,不知具体原因是什么;改用先将流 collect 后再广播,就正常了。
// 2. 流不能嵌套使用,不能在 factDF 中直接使用 pairDF。
package spark
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DateType, DoubleType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, DataFrameReader, Dataset, Encoders, Row, SparkSession}
import java.sql.Date
object SparkGeektime02_2 {

  /**
   * Demo job: tag each fact row with the date range it falls into, then
   * aggregate `value` per (dm1, dm2, dm3, rangeDate).
   *
   * Techniques shown: building a new Row with an extra field, and
   * broadcasting a small dimension table (collected to the driver first —
   * broadcasting the DataFrame itself, or nesting one distributed dataset
   * inside another's transformation, causes an NPE).
   */
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    val sc: SparkContext = session.sparkContext
    sc.setLogLevel("warn")

    // Small dimension table: one (startDate, endDate) pair per line.
    val pairDF: DataFrame = session.read
      .option("sep", ",")
      .option("dateFormat", "yyyy-MM-dd")
      .schema("startDate date, endDate date")
      .csv("D:\\hadoop笔记\\hbase\\code\\scalaStudy\\data\\input\\datesintervers.txt")

    // Fact table: three string dims, an event date, and a numeric value.
    val factDF: DataFrame = session.read
      .option("sep", ",")
      .option("dateFormat", "yyyy-MM-dd")
      .schema("dim1 string, dim2 string, dim3 string, eventDate date, value double")
      .csv("data/input/sales.txt")

    pairDF.show()
    factDF.show()

    // Broadcasting the DataFrame itself (commented alternative below) blows
    // up with an NPE when dereferenced inside another transformation —
    // distributed datasets cannot be nested. Collect the small table to the
    // driver and broadcast the plain rows instead.
    // val pairDFBrod: Broadcast[DataFrame] = sc.broadcast(pairDF)
    val pairRows: Array[Row] = pairDF.collect()
    val pairBroadcast: Broadcast[Array[Row]] = sc.broadcast(pairRows)

    // Output schema for the encoder. Field names here MUST match the column
    // names used in the groupBy/select below ("rangeDate", not "dateRange").
    val encoder =
      RowEncoder(
        StructType(Seq(
          StructField("dm1", StringType),
          StructField("dm2", StringType),
          StructField("dm3", StringType),
          StructField("eventDate", DateType),
          StructField("value", DoubleType),
          StructField("rangeDate", StringType)
        ))
      )

    // Schema attached to rows built inside mapPartitions: the fact schema
    // plus the appended range column. Known on the driver, so compute it once
    // here instead of lazily per partition.
    val taggedSchema: StructType = factDF.schema.add("rangeDate", StringType)

    val resultDS: Dataset[Row] = factDF.mapPartitions(
      it => {
        val pairs = pairBroadcast.value
        // flatMap (not map) so fact rows that match no range are dropped
        // instead of being emitted as nulls, which would break the encoder.
        it.flatMap(
          factRow => {
            val eventDate: Date = factRow.getDate(3)
            // Inclusive containment check; when ranges overlap, the last
            // matching pair wins (same as the original foreach-overwrite).
            val matched: Option[Row] = pairs.filter(
              pr => {
                val startDate: Date = pr.getDate(0)
                val endDate: Date = pr.getDate(1)
                eventDate.compareTo(startDate) >= 0 && eventDate.compareTo(endDate) <= 0
              }
            ).lastOption
            matched.map(
              pr => {
                // New row = original fact fields + concatenated range label.
                val label = pr.getDate(0).toString + pr.getDate(1).toString
                new GenericRowWithSchema((factRow.toSeq :+ label).toArray, taggedSchema): Row
              }
            )
          }
        )
      }
    )(encoder)

    import org.apache.spark.sql.functions._
    val frame: DataFrame = resultDS.toDF()
      .groupBy("dm1", "dm2", "dm3", "rangeDate")
      .agg(sum("value") as "sum_value")
      .select("sum_value", "dm1", "dm2", "dm3", "rangeDate")
    frame.show()
    resultDS.show()
  }
}