Spark SQL
1 Loading files
RDD: sc.textFile
DataFrame: spark.read.text (the result has only a single "value" column with no per-field names, so field-level queries are not possible without splitting)
Dataset: spark.read.textFile
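That said, the DataFrame returned by spark.read.text can still be queried through its single value column. A minimal sketch, assuming the SparkSession spark and the import spark.implicits._ from the example below:

import org.apache.spark.sql.functions.{explode, split}
// split breaks each line on spaces; explode turns the resulting array into one row per word
val lines = spark.read.text("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\words.txt")
lines.select(explode(split($"value", " ")).as("word")).show()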
package chapter5

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object Demo {
  def main(args: Array[String]): Unit = {
    // Create the Spark SQL entry point, i.e. the SparkSession
    val spark: SparkSession = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    // SparkSession wraps a SparkContext, so it can be obtained directly
    val sc: SparkContext = spark.sparkContext
    // Set the log level
    sc.setLogLevel("WARN")
    import spark.implicits._
    // Load the file as an RDD[String]
    val file: RDD[String] = sc.textFile("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\words.txt")
    val splitFile: RDD[String] = file.flatMap(_.split(" "))
    // Load as a DataFrame: each row is a Row with a single "value" column
    val data: DataFrame = spark.read.text("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\words.txt")
    // data.flatMap(...) would operate on Row here, which is why the typed Dataset below is more convenient
    val datas: Dataset[String] = spark.read.textFile("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\words.txt")
    val result: Dataset[String] = datas.flatMap(_.split(" "))
  }
}
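To carry the loading code through to a complete word count, the Dataset API can finish the job by grouping on the implicit value column. A minimal self-contained sketch; the object name WordCountSql is ours, and the file path is reused from the example above:

package chapter5

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object WordCountSql {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("wordcount").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    // textFile yields a Dataset[String]; after flatMap the single column is still named "value"
    val words: Dataset[String] = spark.read
      .textFile("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\words.txt")
      .flatMap(_.split(" "))
    // Group on the "value" column and count occurrences of each word
    val counts: DataFrame = words.groupBy("value").count()
    counts.show()
    spark.stop()
  }
}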
2 Common DataFrame operations
package chapter5

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSql_Demo {
  def main(args: Array[String]): Unit = {
    // Create the Spark SQL entry point
    val spark: SparkSession = SparkSession.builder()
      .appName("demo")
      .master("local[*]")
      // To query tables in a Hive warehouse, enable Hive support
      //.enableHiveSupport()
      .getOrCreate()
    // Get the SparkContext
    val sc: SparkContext = spark.sparkContext
    // Set the log level
    sc.setLogLevel("WARN")
    // Load the data
    val personDF: DataFrame = spark.read.json("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\people.json")
    // Show the whole table
    //personDF.show(2, false)
    // Print the schema
    //personDF.printSchema()
    // Select only name and age
    //personDF.select("name", "age").show()
    // The fully explicit column form
    //personDF.select(personDF.col("name"), personDF.col("age"), personDF.col("age") + 1).show()
    // Many methods (such as the $ column syntax) are not plain Spark SQL calls and need this import
    import spark.implicits._
    // Select name and age, adding 1 to age
    //personDF.select($"name", $"age", $"age" + 1 as "ages").show()
    // Filter for the people older than 30
    //personDF.filter($"age" > 30).show()
    // Keep only name and age, requiring age greater than 30
    //personDF.select($"name", $"age").filter($"age" > 30).show()
    // SQL style: first register the data as a temporary view
    personDF.createOrReplaceTempView("t_person")
    // Query the whole table
    //spark.sql("select * from t_person").show()
    // Query only the rows with age greater than 30
    //spark.sql("select * from t_person where age>30").show()
    // A long SQL statement can be written as a multi-line string
    val sql =
      """
        |select * from t_person
        |where age>30
      """.stripMargin
    spark.sql(sql).show()
  }
}
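createOrReplaceTempView is scoped to the current SparkSession. Spark also supports global temporary views, which live in the reserved global_temp database and remain visible to other sessions in the same application. A minimal sketch, reusing spark and personDF from SparkSql_Demo above (the view name t_person_global is ours):

// Register a global temporary view
personDF.createGlobalTempView("t_person_global")
// Global temp views must be qualified with the global_temp database
spark.sql("select name, age from global_temp.t_person_global where age > 30").show()
// Still visible from a brand-new session of the same application
spark.newSession().sql("select count(*) from global_temp.t_person_global").show()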
3 Obtaining a Dataset via reflection
package chapter5

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

case class People(age: Long, name: String, hobby: String)

object Create_DateSet {
  def main(args: Array[String]): Unit = {
    // Create the Spark SQL entry point
    val spark: SparkSession = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    // Get the SparkContext
    val sc: SparkContext = spark.sparkContext
    // Set the log level
    sc.setLogLevel("WARN")
    // Import implicits (provides the Encoder needed by as[People])
    import spark.implicits._
    // Load the JSON file
    val personDF: DataFrame = spark.read.json("E:\\offcn\\Spark\\SparkDay01\\资料\\data\\people.json")
    // Print the schema
    //personDF.printSchema()
    // Convert the untyped DataFrame to a typed Dataset via the People case class
    val personDS: Dataset[People] = personDF.as[People]
    personDS.show()
  }
}
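Reflection also works without a file: with spark.implicits._ in scope, a Seq of case-class instances converts straight to a Dataset via toDS, with the schema inferred from People. A minimal sketch that could be appended inside main above; the sample values are made up for illustration:

// Hypothetical sample data; any Seq of case-class instances works the same way
val people = Seq(
  People(30, "zhangsan", "basketball"),
  People(25, "lisi", "reading")
)
// toDS comes from spark.implicits._ and infers the schema from People by reflection
val peopleDS: Dataset[People] = people.toDS()
peopleDS.printSchema()
peopleDS.show()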