Spark Dataset / DataFrame study notes

1. broadcasting a dataset:

1.1

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions

// your_data_frame and other_df are placeholders for existing DataFrames
val df: DataFrame = your_data_frame

// mark df as small enough to be shipped to every executor
val broadcastedDf = functions.broadcast(df)
other_df.join(broadcastedDf)
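To confirm the hint took effect, inspect the physical plan; a minimal sketch (the "id" join key is an assumption made for the example):

// a BroadcastHashJoin node in the plan confirms the hint was applied
other_df.join(broadcastedDf, Seq("id")).explain()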

1.2

import org.apache.spark.sql.functions.broadcast
import spark.implicits._  // for toDF and the $"..." syntax

case class Employee(name: String, age: Int, depId: String)
case class Department(id: String, name: String)
 
val employeesRDD = sc.parallelize(Seq( 
    Employee("Mary", 33, "IT"), 
    Employee("Paul", 45, "IT"), 
    Employee("Peter", 26, "MKT"), 
    Employee("Jon", 34, "MKT"), 
    Employee("Sarah", 29, "IT"),
    Employee("Steve", 21, "Intern")
))
val departmentsRDD = sc.parallelize(Seq( 
    Department("IT", "IT  Department"),
    Department("MKT", "Marketing Department"),
    Department("FIN", "Finance & Controlling")
))
 
val employeesDF = employeesRDD.toDF
val departmentsDF = departmentsRDD.toDF
 
// hint that the department data is small enough to broadcast
val tmpDepartments = broadcast(departmentsDF.as("departments"))

employeesDF.join(tmpDepartments,
   $"depId" === $"id",  // join by employees.depId == departments.id
   "inner").show()

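Note that Steve ("Intern") disappears from the output: an inner join keeps only employees whose depId matches an existing department. Switching the join type keeps unmatched employees:

// unmatched employees are kept, with null department columns
employeesDF.join(tmpDepartments,
   $"depId" === $"id",
   "left_outer").show()
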
2.dataset join

val xs = Seq(("a", "foo", 2.0), ("x", "bar", -1.0)).toDS

val ys = Seq(("a", "foo", 2.0), ("y", "bar", 1.0)).toDS

xs.joinWith(ys, xs("_1") === ys("_1") && xs("_2") === ys("_2"), "left").show
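Unlike join, joinWith keeps each side as a whole object, so the result is a Dataset of pairs rather than a flattened DataFrame; with a left join the right element is null for unmatched rows. A minimal sketch of consuming the typed result:

// result type: Dataset[((String, String, Double), (String, String, Double))]
xs.joinWith(ys, xs("_1") === ys("_1") && xs("_2") === ys("_2"), "left")
  .map { case (l, r) => (l._1, Option(r).map(_._3)) }  // null-safe access to the right side
  .show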
scala> case class A(id: Long, name: String)
defined class A

scala> case class B(id: Long, name: String)
defined class B

scala> val as = Seq(A(0, "zero"), A(1, "one")).toDS
as: org.apache.spark.sql.Dataset[A] = [id: bigint, name: string]

scala> val bs = Seq(B(0, "zero"), B(1, "jeden")).toDS
bs: org.apache.spark.sql.Dataset[B] = [id: bigint, name: string]

scala> as.join(bs).where(as("id") === bs("id")).show
+---+----+---+-----+
| id|name| id| name|
+---+----+---+-----+
|  0|zero|  0| zero|
|  1| one|  1|jeden|
+---+----+---+-----+


scala> as.join(bs).where(as("id") === bs("id")).where(as("name") === bs("name")).show
+---+----+---+----+
| id|name| id|name|
+---+----+---+----+
|  0|zero|  0|zero|
+---+----+---+----+
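Both inputs keep their original column names, so the joined result above has duplicate id and name columns. Renaming one side first avoids ambiguous references downstream (a minimal sketch):

// bName makes the right-hand name column unambiguous
as.join(bs.withColumnRenamed("name", "bName"), "id").show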

3. joining multiple datasets

import org.apache.spark.sql.functions.{broadcast, col}

case class DS(id: Int, colA: String)
val ds1 = spark.read.parquet("X1").as[DS]
val ds2 = spark.read.parquet("X2").as[DS]
val ds3 = spark.read.parquet("X3").as[DS]
val ds4 = spark.read.parquet("X4").as[DS]
case class JoinedDS(colB: String)

// ds2 is the small side, so it carries the broadcast hint;
// joining on colB assumes the parquet files also contain that column
val joinedDS = ds1.join(broadcast(ds2), Seq("id"), "inner")
  .join(ds3, Seq("id", "colB"), "inner")
  .join(ds4, Seq("id"), "inner")
  .select(col("colB"))
  .as[JoinedDS]
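
When every input joins on the same key, the chain can also be written as a fold; a minimal sketch, assuming "id" is the only join key (non-key columns sharing a name are simply duplicated in the result):

// fold the remaining datasets into ds1, one inner join per step
val folded = Seq(ds2, ds3, ds4)
  .foldLeft(ds1.toDF) { (acc, ds) => acc.join(ds, Seq("id"), "inner") }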

4.where in

import spark.implicits._  // for the $"..." syntax
df.where($"type".isin("type1", "type2") and $"status".isin("completed", "inprogress"))

val types = Seq("type1","type2")
val statuses = Seq("completed","inprogress")

df.where($"type".isin(types:_*) and $"status".isin(statuses:_*))
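
isin embeds the whole value list in the query plan, so for large value sets a left_semi join against a small DataFrame usually scales better; a minimal sketch:

// keep only rows whose type appears in the list; typesDF contributes no columns
val typesDF = types.toDF("type")
df.join(typesDF, Seq("type"), "left_semi")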

5. dataset if else (case when..)

// Scala:
people.select(when(people("gender") === "male", 0)
  .when(people("gender") === "female", 1)
  .otherwise(2))

// Java:
people.select(when(col("gender").equalTo("male"), 0)
  .when(col("gender").equalTo("female"), 1)
  .otherwise(2));
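
If .otherwise is omitted, rows matching no branch get null. The same branching can be written as a SQL CASE expression, which also names the output column:

// equivalent CASE WHEN via selectExpr
people.selectExpr(
  "CASE WHEN gender = 'male' THEN 0 " +
  "WHEN gender = 'female' THEN 1 ELSE 2 END AS gender_code")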

6. identifying a Column in select

In the Scala DSL for select, there are several ways to identify a Column:

From a symbol: 'name
From a string: $"name" or col("name")
From an expression: expr("nvl(name, 'unknown') as renamed")

To get a TypedColumn from a Column, use myCol.as[T] and then map over the resulting typed Dataset, as the sketch below shows.
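
A minimal sketch exercising each form (the two-row df is an assumption made for the example):

import org.apache.spark.sql.functions.{col, expr}
import spark.implicits._

val df = Seq(("Ann", 1), ("Bob", 2)).toDF("name", "n")

df.select('name)                                   // from a symbol
df.select($"name", col("name"))                    // from strings
df.select(expr("nvl(name, 'unknown') as renamed")) // from an expression

// TypedColumn: select a Dataset[String], then map over it
val lengths = df.select($"name".as[String]).map(_.length)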



