// A Spark SQL join exercise
package cn.kfc.dem08
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
// Person record parsed from "id,name,country" lines; pairs with spark.createDataFrame(rdd).
// NOTE(review): case class names should be UpperCamelCase (e.g. `Person`) — kept as-is to avoid breaking callers.
case class fun(id:Int,name:String,country:String)
// Nation lookup record parsed from "nati,nat" lines in file/nation.txt.
// Presumably nati is the join key matching the person's country (e.g. "china") and nat its
// display name — inferred from the join condition p.country = n.nati; confirm against the file.
// NOTE(review): case class names should be UpperCamelCase (e.g. `Nation`) — kept as-is to avoid breaking callers.
case class nation(nati:String,nat:String)
/**
 * Demo: joining a person data set with a nation lookup table, showing several
 * equivalent ways to build a DataFrame (text file / parallelized array /
 * createDataset; case class / tuple / Row + schema) and two ways to express
 * the join (SQL on temp views vs. the DataFrame DSL). Only one variant per
 * step is active; the alternatives are kept as comments for reference.
 */
object joinTest {
  def main(args: Array[String]): Unit = {
    // Local SparkSession using all available cores.
    val spark: SparkSession = SparkSession.builder()
      .appName("Sql_join")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext

    // --- Person data: four alternative loading strategies ---
    //val value = sc.textFile("file/join.txt")            // 1. read from a text file
    // val value = spark.read.textFile("file/join.txt")   // 2. spark.read.textFile (already a Dataset)
    val value = sc.parallelize(Array("1,laozhao,china", "2,laoli,usa")) // 3. inline array data
    // 4. spark.createDataset — pairs with the tuple or case-class variant below
    // val value: Dataset[(Int, String, String)] = spark.createDataset(Array("1,laozhao,china", "2,laoli,usa"))
      .map(line => {
        val splits = line.split(",")
        val id = splits(0).toInt
        val name = splits(1)
        val country = splits(2)
        //fun(id, name, country)   // case class — pairs with createDataFrame(rdd)
        //(id, name, country)      // tuple — pairs with toDF("id", "name", "country")
        // (The tuple above was previously an active statement whose value was silently
        // discarded every iteration; only the last expression, Row(...), is returned.)
        Row(id, name, country)     // Row — pairs with createDataFrame(rdd, schema)
      })
    //val df1 = spark.createDataFrame(value)         // for variant 1; variant 2 already yields a DataFrame
    // val df1 = value.toDF("id", "name", "country") // for the tuple variant

    // Explicit schema matching the Row(id, name, country) produced above.
    val sch: StructType = StructType(List(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("country", StringType, true)
    ))
    val df1 = spark.createDataFrame(value, sch)

    // Nation lookup table: assumes "nati,nat" lines in file/nation.txt — TODO confirm file format.
    val country: RDD[nation] = sc.textFile("file/nation.txt")
      .map(line => {
        val splits = line.split(",")
        val nati = splits(0)
        val nat = splits(1)
        nation(nati, nat)
      })
    val df2: DataFrame = spark.createDataFrame(country)

    // Approach 1: register temp views and run SQL.
    // (The SELECT keyword in this commented query was previously garbled by a stray
    //  "bleType, true)" paste fragment; repaired here.)
    /* df1.createTempView("personinfo")
    df2.createTempView("nationinfo")
    spark.sql("select id,name,p.country,n.nat from personinfo p inner join nationinfo n on p.country=n.nati ")
      .show()
    */

    // Approach 2: DataFrame DSL inner join on differently named columns.
    val df3 = df1.join(df2, $"country" === $"nati", "inner")
    // val df3 = df1.join(df2, "country") // usable only when both sides share the join column name
    df3.show()

    sc.stop()
    spark.close()
  }
}