from pyspark.sql import SparkSession
if __name__ == '__main__':
    # Demonstrates Spark SQL set operators (EXCEPT/MINUS, INTERSECT, UNION)
    # on two single-column tables, printing each result with show().
    spark = SparkSession.builder.getOrCreate()

    df1 = spark.createDataFrame([[3], [1], [2], [2], [3], [4]], schema=['v1'])
    df2 = spark.createDataFrame([[5], [1], [2], [2]], schema=['v2'])

    # Register the DataFrames as temp views so they can be queried by name.
    # createOrReplaceTempView (rather than createTempView) keeps the script
    # re-runnable when getOrCreate() hands back a session that already has
    # views with these names.
    df1.createOrReplaceTempView("num1")
    df2.createOrReplaceTempView("num2")

    # --- Difference: EXCEPT (MINUS is an alias) ---
    # Rows of num1 that do not appear in num2, with duplicates removed.
    spark.sql("select * from num1 EXCEPT select * from num2").show()
    spark.sql("select * from num1 MINUS select * from num2").show()

    # Same difference, but duplicates are kept (ALL).
    spark.sql("select * from num1 EXCEPT All (select * from num2)").show()
    spark.sql("select * from num1 MINUS All (select * from num2)").show()

    print("111111111111111111111111")

    # --- Intersection: INTERSECT ---
    # Rows present in both num1 and num2; DISTINCT is the default.
    spark.sql("(select * from num1) INTERSECT (select * from num2)").show()
    spark.sql("(select * from num1) INTERSECT DISTINCT (select * from num2)").show()

    # Intersection that keeps duplicates (ALL).
    spark.sql("select * from num1 INTERSECT ALL (select * from num2)").show()

    print("2222222222222222222222222")

    # --- Union: UNION ---
    # Rows from either table; DISTINCT is the default.
    spark.sql("(select * from num1) UNION (select * from num2)").show()
    spark.sql("(select * from num1) UNION DISTINCT (select * from num2)").show()

    # Union that keeps duplicates (ALL).
    spark.sql("select * from num1 UNION ALL (select * from num2)").show()