// 第1关:RDD转换为DataFrame实现文本文件数据源读取
// (Task 1: implement a text-file data source by converting an RDD to a DataFrame)
import java.text.SimpleDateFormat

import scala.collection.mutable

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._
object sparkSQL01 {

  /**
   * Task 1: read text-file data sources by converting RDDs to DataFrames.
   *
   * Loads four comma-separated text files (student / teacher / course / score),
   * registers each one as a temporary SQL view via [[registerView]], then runs
   * a handful of demonstration queries against those views.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("test")
      .config("spark.sql.shuffle.partitions", "5")
      .getOrCreate()

    /** ************************ student table *****************************/
    val studentSchema = StructType(Seq(
      StructField("Sno", StringType, nullable = false),      // student number
      StructField("Sname", StringType, nullable = false),    // student name
      StructField("Ssex", StringType, nullable = false),     // student sex
      StructField("Sbirthday", StringType, nullable = true), // date of birth
      StructField("SClass", StringType, nullable = true)     // class
    ))
    registerView(spark, "data/student.txt", studentSchema, "student")

    /** ************************ teacher table *****************************/
    val teacherSchema = StructType(Seq(
      StructField("Tno", StringType, nullable = false),   // teacher id (primary key)
      StructField("Tname", StringType, nullable = false), // teacher name
      StructField("Tsex", StringType, nullable = false),  // teacher sex
      StructField("Tyear", IntegerType, nullable = true), // year of birth
      StructField("Prof", StringType, nullable = true),   // professional title
      StructField("Depart", StringType, nullable = false) // department
    ))
    val teacherDF = registerView(spark, "data/teacher.txt", teacherSchema, "teacher")

    /** ************************ course table *****************************/
    val courseSchema = StructType(Seq(
      StructField("Cno", StringType, nullable = false),   // course number
      StructField("Cname", StringType, nullable = false), // course name
      StructField("Tno", StringType, nullable = false)    // teacher id
    ))
    registerView(spark, "data/course.txt", courseSchema, "course")

    /** ************************ score table *****************************/
    val scoreSchema = StructType(Seq(
      StructField("Sno", StringType, nullable = false),    // student number (FK)
      StructField("Cno", StringType, nullable = false),    // course number (FK)
      StructField("Degree", IntegerType, nullable = true)  // score
    ))
    val scoreDF = registerView(spark, "data/score.txt", scoreSchema, "score")

    /** ************************ queries *****************************/
    // Show all student rows.
    // NOTE(review): the original comment said "ordered by class" but the SQL
    // orders by Sno (student number) — confirm which ordering is intended.
    spark.sql("SELECT * FROM student ORDER BY Sno").show()

    // Tname and Prof of teachers whose title does NOT appear in both the
    // computer and the electronic-engineering departments.
    spark.sql("SELECT tname, prof " +
      "FROM Teacher " +
      "WHERE prof NOT IN (SELECT a.prof " +
      "FROM (SELECT prof " +
      "FROM Teacher " +
      "WHERE depart = 'department of computer' " +
      ") a " +
      "JOIN (SELECT prof " +
      "FROM Teacher " +
      "WHERE depart = 'department of electronic engineering' " +
      ") b ON a.prof = b.prof) ").orderBy("tname").show(false)

    // Teachers whose sex is 'female'.
    teacherDF.filter("Tsex = 'female'").show(false)

    // Distinct teacher departments.
    teacherDF.select("Depart").distinct().show(false)

    // Highest score overall, and mean score per course.
    // (The original assigned the Unit result of show() to vals, which made the
    // commented-out write below impossible; keep the DataFrames instead.)
    val maxScore = scoreDF.agg("Degree" -> "max")
    maxScore.show()
    val meanScoreByCourse = scoreDF.groupBy("Cno").agg("Degree" -> "mean").orderBy("Cno")
    meanScoreByCourse.show()
    // meanScoreByCourse.write.format("json").save("mean.json")

    // Release local Spark resources before exiting.
    spark.stop()
  }

  /**
   * Loads a comma-separated text file as a DataFrame with the given schema and
   * registers it as a temporary SQL view.
   *
   * Columns declared as IntegerType in the schema are parsed with toInt (after
   * trimming stray whitespace, so a trailing space no longer throws); all other
   * columns are kept as raw strings.
   *
   * @param spark    the active session
   * @param path     path of the comma-separated input file
   * @param schema   target schema; field order must match the file's columns
   * @param viewName name under which the DataFrame is registered for SQL
   * @return the created DataFrame
   */
  private def registerView(spark: SparkSession,
                           path: String,
                           schema: StructType,
                           viewName: String): DataFrame = {
    val rows = spark.sparkContext
      .textFile(path)
      .map(_.split(","))
      .map { fields =>
        Row.fromSeq(schema.fields.zipWithIndex.map {
          case (f, i) if f.dataType == IntegerType => fields(i).trim.toInt
          case (_, i)                              => fields(i)
        }.toSeq)
      }
    val df = spark.createDataFrame(rows, schema)
    df.createOrReplaceTempView(viewName)
    df
  }
}