一、通过定义类的方式指定schema来构建DataFrame
object GenerateDFByClass{
def main(args: Array[String]): Unit = {
//1. Create the Spark configuration
val conf = new SparkConf()
.setAppName("generateDF") //application name
.setMaster("local[2]") //run in local mode with 2 worker threads
//2. Create the SparkContext and SQLContext
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
try {
//3. Load the data file from the file system and split each line on spaces
//val fileRdd = sc.textFile("hdfs://hadoop01:9000/stu.txt").map(_.split(" "))
val fileRdd = sc.textFile("F:/stu.txt").map(_.split(" "))
//4. Map each record onto the Stu case class so the schema can be inferred
//NOTE(review): assumes each line has at least 3 space-separated fields and
//that fields 0 and 2 parse as Int — malformed lines will fail the job.
val stuRdd = fileRdd.map(line => Stu(line(0).toInt, line(1), line(2).toInt))
//5. Import the implicit conversions; without this import the RDD
//cannot be converted to a DataFrame via toDF
import sqlContext.implicits._
val stuDF = stuRdd.toDF
//6. Register a temporary table so it can be queried with SQL
stuDF.registerTempTable("student")
//7. Query the top three students by score
val resDF = sqlContext.sql("select * from student order by score desc limit 3")
//8. Print the result to the console
resDF.show()
//9. Save the result into a MySQL database
val prop = new Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "root")
//Append the query result to the student table
resDF.write.mode(SaveMode.Append).jdbc(
"jdbc:mysql://localhost:3306/db1",
"student",
prop
)
} finally {
//Always release cluster resources, even if the job above fails
sc.stop()
}
}
}
//Student case class used as the schema mapping when converting an RDD to a DataFrame
case class Stu(sid:Int,sname:String,score:Int)
二、通过StructType指定schema构建DataFrame
object GenerateDFByStructType{
def main(args: Array[String]): Unit = {
//1. Create the Spark configuration (local mode, 2 worker threads)
val conf = new SparkConf().setAppName("generateDF2").setMaster("local[2]")
//2. Create the cluster entry point
val sc = new SparkContext(conf)
//3. Create the SQLContext
val sqlContext = new SQLContext(sc)
try {
//4. Read the file from the local file system and split each line on spaces
val dataRdd = sc.textFile("F:/stu.txt").map(_.split(" "))
//5. Describe the schema explicitly with StructType
val schema = StructType(
List(
StructField("sid", IntegerType, false), //not nullable
StructField("sname", StringType, true),
StructField("score", IntegerType, true)
)
)
//6. Map the raw RDD to an RDD[Row] matching the schema
//NOTE(review): assumes each line has 3 fields with integer sid/score —
//malformed lines will fail the job.
val rowRdd = dataRdd.map(s => Row(s(0).toInt, s(1), s(2).toInt))
//7. Combine the row RDD and the schema into a DataFrame
val resDF = sqlContext.createDataFrame(rowRdd, schema)
//8. Print the result
resDF.show()
//9. Save the result in JSON format (appended under F:/res)
resDF.write.mode(SaveMode.Append).json("F:/res")
} finally {
//Always release cluster resources, even if the job above fails
sc.stop()
}
}
}