1.dataset to broadcast:
1.1
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions

val df: DataFrame = your_data_frame
val broadcasted_df = functions.broadcast(df)
// add a join condition as needed; without one Spark treats this as a cross join
other_df.join(broadcasted_df)
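A minimal end-to-end sketch with an explicit join condition (factDF, lookupDF and the key column "id" are placeholder names, not part of the original snippet):
import org.apache.spark.sql.functions.broadcast

// hypothetical: a large fact table joined to a small lookup table on "id"
val result = factDF.join(broadcast(lookupDF), Seq("id"), "inner")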
1.2
import org.apache.spark.sql.functions.broadcast
import spark.implicits._ // for toDF and the $-column syntax

case class Employee(name: String, age: Int, depId: String)
case class Department(id: String, name: String)
val employeesRDD = sc.parallelize(Seq(
Employee("Mary", 33, "IT"),
Employee("Paul", 45, "IT"),
Employee("Peter", 26, "MKT"),
Employee("Jon", 34, "MKT"),
Employee("Sarah", 29, "IT"),
Employee("Steve", 21, "Intern")
))
val departmentsRDD = sc.parallelize(Seq(
Department("IT", "IT Department"),
Department("MKT", "Marketing Department"),
Department("FIN", "Finance & Controlling")
))
val employeesDF = employeesRDD.toDF
val departmentsDF = departmentsRDD.toDF
// materializing the department data and marking it for broadcast
val tmpDepartments = broadcast(departmentsDF.as("departments"))
employeesDF.join(tmpDepartments, // already marked above, no second broadcast() needed
$"depId" === $"id", // join by employees.depId == departments.id
"inner").show()
2.dataset join
import spark.implicits._ // for toDS on local Seqs

val xs = Seq(("a", "foo", 2.0), ("x", "bar", -1.0)).toDS
val ys = Seq(("a", "foo", 2.0), ("y", "bar", 1.0)).toDS
xs.joinWith(ys, xs("_1") === ys("_1") && xs("_2") === ys("_2"), "left").show
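Unlike join, joinWith keeps each side as a whole typed value, so the result is a Dataset of tuples; with "left" the right element is null for unmatched rows. A small sketch continuing from xs and ys above:
val pairs = xs.joinWith(ys, xs("_1") === ys("_1") && xs("_2") === ys("_2"), "left")
// pairs: Dataset[((String, String, Double), (String, String, Double))]
pairs.map { case (l, r) => (l._1, Option(r).map(_._3)) }.show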
scala> case class A(id: Long, name: String)
defined class A
scala> case class B(id: Long, name: String)
defined class B
scala> val as = Seq(A(0, "zero"), A(1, "one")).toDS
as: org.apache.spark.sql.Dataset[A] = [id: bigint, name: string]
scala> val bs = Seq(B(0, "zero"), B(1, "jeden")).toDS
bs: org.apache.spark.sql.Dataset[B] = [id: bigint, name: string]
scala> as.join(bs).where(as("id") === bs("id")).show
+---+----+---+-----+
| id|name| id| name|
+---+----+---+-----+
| 0|zero| 0| zero|
| 1| one| 1|jeden|
+---+----+---+-----+
scala> as.join(bs).where(as("id") === bs("id")).where(as("name") === bs("name")).show
+---+----+---+----+
| id|name| id|name|
+---+----+---+----+
| 0|zero| 0|zero|
+---+----+---+----+
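The joined result above carries two id and two name columns, which makes later references ambiguous; aliasing each Dataset is one way to disambiguate (a sketch, the a_name/b_name labels are made up):
val joined = as.as("a").join(bs.as("b"), $"a.id" === $"b.id")
joined.select($"a.id", $"a.name".as("a_name"), $"b.name".as("b_name")).show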
3.joining multiple datasets
import org.apache.spark.sql.functions.{broadcast, col}
import spark.implicits._

// colB must exist in every input: it is both a join key and the selected column
case class DS(id: Int, colB: String)
val ds1 = spark.read.parquet("X1").as[DS]
val ds2 = spark.read.parquet("X2").as[DS]
val ds3 = spark.read.parquet("X3").as[DS]
val ds4 = spark.read.parquet("X4").as[DS]

case class JoinedDS(colB: String)
val joinedDS = ds1.join(broadcast(ds2), Seq("id"), "inner")
  .join(ds3, Seq("id", "colB"), "inner")
  .join(ds4, Seq("id"), "inner")
  .select(col("colB"))
  .as[JoinedDS]
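For a self-contained test, the X1–X4 inputs the snippet expects can be produced from in-memory data first (the rows here are made up):
Seq(DS(1, "b1"), DS(2, "b2")).toDS.write.mode("overwrite").parquet("X1")
// write analogous sample data for X2, X3 and X4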
4.where in
import spark.implicits._ // sqlContext.implicits._ on Spark 1.x

df.where($"type".isin("type1", "type2") and $"status".isin("completed", "inprogress"))

// the same filter with the accepted values held in collections
val types = Seq("type1", "type2")
val statuses = Seq("completed", "inprogress")
df.where($"type".isin(types: _*) and $"status".isin(statuses: _*))
5. dataset if else (case when..)
// Scala:
import org.apache.spark.sql.functions.when

people.select(when(people("gender") === "male", 0)
  .when(people("gender") === "female", 1)
  .otherwise(2))

// Java (needs: import static org.apache.spark.sql.functions.*;):
people.select(when(col("gender").equalTo("male"), 0)
  .when(col("gender").equalTo("female"), 1)
  .otherwise(2))
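In practice the computed value usually gets a column name; a sketch that keeps the existing columns and adds a coded one (gender_code is a made-up name):
val coded = people.withColumn("gender_code",
  when(people("gender") === "male", 0)
    .when(people("gender") === "female", 1)
    .otherwise(2))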
6.identifying a Column in select
In the Scala DSL for select, there are several ways to identify a Column:
From a symbol: 'name
From a string: $"name" or col("name")
From an expression: expr("nvl(name, 'unknown') as renamed")
To get a TypedColumn from a Column, use myCol.as[T]; the resulting typed Dataset can then be processed with .map.
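A short sketch of the TypedColumn path described above (Person and the sample rows are made up):
import spark.implicits._

case class Person(name: String, age: Int)
val ds = Seq(Person("Ann", 30), Person("Bob", 25)).toDS
val nameCol = ds("name").as[String]        // Column -> TypedColumn
ds.select(nameCol).map(_.toUpperCase).show // a typed Dataset[String]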