文章目录
DSL(Domain-Specific Language)形式
join(self, other, on=None, how=None)
join()
operation takes parameters as below and returns DataFrame.
- param other: Right side of the join
- param on: a string for the join column name
- param how: default
inner
. Must be one ofinner
,cross
,outer
,full
,full_outer
,left
,left_outer
,right
,right_outer
,left_semi
, andleft_anti
.
inner, full, left, right, left semi, left anti, self join
inner join是默认方式
A.join(B,A.emp_dept_id == B.dept_id,"inner").show()
full join
# 以下三者等价
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"outer").show()
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"full").show()
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"fullouter").show()
left join
# 以下等价
empDF.join(deptDF,empDF("emp_dept_id") == deptDF("dept_id"),"left").show()
empDF.join(deptDF,empDF("emp_dept_id") == deptDF("dept_id"),"leftouter").show()
right join
# 以下等价
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"right").show()
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"rightouter").show()
left semi join
left semi join就是left join后右表的所有都抛弃
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftsemi").show()
left anti join
left anti join就是表A left join表B后,没有配上的部分右表的所有都抛弃
left semi join就是表A left join表B后,配上的部分右表的所有都抛弃
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftanti").show()
self join
empDF.alias("emp1").join(empDF.alias("emp2"), \
col("emp1.superior_emp_id") == col("emp2.emp_id"),"inner") \
.select(col("emp1.emp_id"),col("emp1.name"), \
col("emp2.emp_id").alias("superior_emp_id"), \
col("emp2.name").alias("superior_emp_name")) \
.show()
多表join
df1.join(df2,df1.id1 == df2.id2,"inner").join(df3,df1.id1 == df3.id3,"inner")
关联条件多个的join
df1和df2都有name,country,gender,id列,两表关联,以下两种形式等价
df1.join(df2, on=['name','country','gender','id'])
df= df1.join(df2, (df1.name == df2.name) & (df1.country == df2.country) & (df1.gender== df2.gender) & (df1.id== df2.id), 'inner')
sql形式
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
joinDF = spark.sql("select * from EMP e, DEPT d where e.emp_dept_id = d.dept_id").show(truncate=False)
joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id = d.dept_id").show(truncate=False)