一.Introduction
http://spark.apache.org/docs/latest/sql-programming-guide.html
二.Syntax
1.Create an RDD
2.Create a case class (like the table's schema)
3.Associate the RDD with the case class
4.rdd.toDF (a sketch of all four steps follows below)
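A minimal sketch of these four steps, assuming a spark-shell session (Spark 1.x, where sc and sqlContext are predefined) and a comma-separated d://student.txt with id,name,age fields; the Student case class name is illustrative:
// Step 2: a case class that plays the role of the table schema
case class Student(id: Int, name: String, age: Int)

import sqlContext.implicits._ // brings rdd.toDF into scope

val studentRDD = sc.textFile("d://student.txt") // step 1: create the RDD
  .map(_.split(","))
  .map(x => Student(x(0).toInt, x(1), x(2).toInt)) // step 3: associate the RDD with the case class

val df = studentRDD.toDF // step 4: convert it to a DataFrame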
三.DSL style syntax
1.df.show
2.df.select(df.col("name")).show
df.select(col("name"),col("age")).show (recommended)
df.select("name","age")
3.Order by age (descending) and take the oldest:
df.orderBy(col("age").desc).limit(1).show
(equivalent SQL: select * from t order by age desc limit 1)
4.Age > 25
df.filter(("age")>25).show
Attention: this does not compile, because a plain string has no > operator. We must wrap the name in col(...) so Spark knows it is a column:
df.filter(col("age")>25).show
5.Count how many people share each age:
df.groupBy(col("age")).count().show()
6.Select id, name, and age plus 1:
df.select(col("id"),col("name"),col("age")+1).show
四.SQL style syntax
First, we need to register the DataFrame as a temp table:
val df=studentRDD.toDF
df.registerTempTable("t_student")
1.Select the two oldest students:
sqlContext.sql("select * from t_student order by age desc limit 2").show
2.Show the table schema:
sqlContext.sql("desc t_student").show
五.Code
StructTypeSchema:
package SparkSql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object StructTypeSchema {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StructTypeSchema").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Load the raw data from the given path
    val studentRDD = sc.textFile("d://student.txt").map(_.split(","))
    // Define the schema explicitly with StructType
    val schema = StructType(
      List(
        StructField("id", IntegerType, true), // the third argument is nullable
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    // Map each array of fields to a Row
    val rowRDD = studentRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    // Apply the schema to the Row RDD
    val studentDF = sqlContext.createDataFrame(rowRDD, schema)
    // Register it as a temp table
    studentDF.registerTempTable("student")
    // Run SQL: find the four youngest students
    val df = sqlContext.sql("select * from student order by age limit 4")
    //df.write.json(args(1)) // alternative: write JSON to the path given in args(1)
    df.save("d://03241506OUT", "json") // deprecated since Spark 1.4; prefer df.write.json(path)
    sc.stop()
  }
}
LoadDemo:
package SparkSql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object LoadDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LoadDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // read.json loads the files directly as a DataFrame
    val df = sqlContext.read.json("d://03241506OUT")
    df.registerTempTable("t")
    sqlContext.sql("select * from t order by id desc limit 1").show()
    sc.stop()
  }
}
Spark SQL with MySQL:
I recommend the following blog:
http://www.cnblogs.com/wujiadong2014/p/6516598.html
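Note: the MySQL examples below assume the MySQL JDBC driver is on the classpath; a typical sbt dependency is shown here (the artifact version is an assumption, match it to your MySQL server):
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.38" // assumed version; any 5.1.x works with com.mysql.jdbc.Driver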
MysqlSave:
package SparkSql

import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object MysqlSave {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MysqlSave").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val studentRDD = sc.parallelize(Array("8 sb 100")).map(_.split(" "))
    // Define the schema to apply to the RDD
    val schema = StructType(
      List(
        StructField("id", IntegerType, true), // the third argument is nullable
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    // Map the RDD to an RDD[Row]
    val rowRDD = studentRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    // Turn it into a DataFrame
    val studentDF = sqlContext.createDataFrame(rowRDD, schema)
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "root")
    // Append the rows to the database table
    studentDF.write.mode("append").jdbc("jdbc:mysql://192.168.16.100:3306/sparkSql", "sparkSql.student", prop)
    sc.stop()
  }
}
MySqlLoad:
package SparkSql

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MySqlLoad {
  def main(args: Array[String]): Unit = {
    LoggerLevels.setStreamingLogLevels() // project helper that quiets log output
    val conf = new SparkConf().setAppName("MySqlLoad").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val jdbcDF = sqlContext.read.format("jdbc").options(Map(
      "url" -> "jdbc:mysql://192.168.16.100:3306/sparkSql",
      "driver" -> "com.mysql.jdbc.Driver",
      "dbtable" -> "student",
      "user" -> "root",
      "password" -> "root")).load()
    jdbcDF.show()
    // Find the oldest student in the student table
    jdbcDF.registerTempTable("stu")
    val res = sqlContext.sql("select * from stu order by age desc limit 1")
    res.show()
    // res.write.json("d://res")
    // val loadDF = sqlContext.read.load("d://res")
    sc.stop()
  }
}
JdbcRDD:
package SparkSql

import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDD {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JdbcRDD").setMaster("local[2]")
    val sc = new SparkContext(conf)
    def getConnection() = {
      // Load the MySQL driver and open a connection
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://192.168.16.100:3306/sparkSql", "root", "root")
    }
    val jdbcRDD = new JdbcRDD(
      sc,
      getConnection,
      "select * from student where id >= ? and id <= ?",
      // lowerBound = 2, upperBound = 5, numPartitions = 3: the range 2 <= id <= 5
      // is split into 3 partitions, each binding its sub-range to the two ? placeholders
      2, 5, 3,
      r => {
        // Map each ResultSet row to an (id, age) tuple
        val id = r.getInt(1)
        val age = r.getInt(3)
        (id, age)
      }
    )
    val jrdd = jdbcRDD.collect()
    println(jrdd.toBuffer)
    sc.stop()
  }
}