spark计算两个DataFrame的差集、交集、合集
闲话不说,直接上代码和结果
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
/**
 * Demonstrates set-style operations between two Spark DataFrames:
 * difference (`except`), intersection (`intersect`) and union (`union`,
 * with and without `distinct`).
 *
 * Both frames are projected down to the single `sentence` column before being
 * compared; the original author recommends comparing on one selected column
 * only.
 *
 * Adapted from https://www.cnblogs.com/TTyb/p/7991952.html
 *
 * @author yyz
 * @date 2021/01/24 20:59
 */
object DataFrameFun {

  /**
   * Builds two small in-memory DataFrames, prints them, and then prints the
   * result of each set operation applied to their `sentence` columns.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // Silence everything logged under the "org" logger hierarchy so that only
    // the DataFrame tables and the section headings reach the console.
    Logger.getLogger("org").setLevel(Level.OFF)

    // Legacy way of obtaining an entry point, kept from the original for reference:
    //   val conf = new SparkConf().setAppName("TTyb").setMaster("local")
    //   val sc = new SparkContext(conf)
    //   val spark = new SQLContext(sc)
    // The SparkSession builder below is what this example actually uses.
    val spark = SparkSession.builder().master("local").appName("AppName").getOrCreate()

    // Release the session even when one of the steps below throws; the
    // original never stopped it.
    try {
      val sentenceDataFrame1 = spark.createDataFrame(Seq(
        (1, "asf"),
        (2, "2143"),
        (4, "f8934y")
      )).toDF("label", "sentence")
      sentenceDataFrame1.show()

      val sentenceDataFrame = spark.createDataFrame(Seq(
        (1, "asf"),
        (2, "2143"),
        (3, "rfds")
      )).toDF("label", "sentence")
      sentenceDataFrame.show()

      // Every operation below compares only the "sentence" column, so project
      // each frame once and reuse the projections.
      val sentences1 = sentenceDataFrame1.select("sentence")
      val sentences = sentenceDataFrame.select("sentence")

      // Rows present in the first frame but absent from the second.
      println("1、差集 except")
      val newDF1 = sentences1.except(sentences)
      newDF1.show()

      // Rows present in both frames.
      println("2、交集 intersect")
      val newDF2 = sentences1.intersect(sentences)
      newDF2.show()

      // Plain union simply appends the rows of the second frame, so values
      // that occur in both frames appear twice.
      println("3、合集 union")
      val newDF3 = sentences1.union(sentences)
      newDF3.show()

      // Union followed by distinct() drops the duplicated rows.
      println("4、合集最好去一下重 distinct :")
      val newDF4 = sentences1.union(sentences).distinct()
      newDF4.show()
    } finally {
      spark.stop()
    }
  }
}
/***
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 4| f8934y|
+-----+--------+
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 3| rfds|
+-----+--------+
1、差集 except
+--------+
|sentence|
+--------+
| f8934y|
+--------+
2、交集 intersect
+--------+
|sentence|
+--------+
| asf|
| 2143|
+--------+
3、合集 union
+--------+
|sentence|
+--------+
| asf|
| 2143|
| f8934y|
| asf|
| 2143|
| rfds|
+--------+
4、合集最好去一下重 distinct :
+--------+
|sentence|
+--------+
| rfds|
| asf|
| 2143|
| f8934y|
+--------+
* */