一.Introduction
http://spark.apache.org/docs/latest/sql-programming-guide.html
二.Syntax
1.Create an RDD
2.Create a case class (like the table's schema)
3.Associate the RDD with the case class
4.rdd.toDF (a sketch of all four steps follows below)
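A minimal sketch of these four steps, assuming a spark-shell session (Spark 1.x, where sc and sqlContext are predefined) and a comma-separated d://student.txt with id,name,age fields; the Student case class name is illustrative:
// Step 2: a case class that plays the role of the table schema
case class Student(id: Int, name: String, age: Int)

import sqlContext.implicits._ // brings rdd.toDF into scope

val studentRDD = sc.textFile("d://student.txt") // step 1: create the RDD
  .map(_.split(","))
  .map(x => Student(x(0).toInt, x(1), x(2).toInt)) // step 3: associate the RDD with the case class

val df = studentRDD.toDF // step 4: convert it to a DataFrame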
三.DSL style syntax
1.df.show
2.df.select(df.col("name")).show
df.select(col("name"),col("age")).show (recommended)
df.select("name","age")
3.Order by age (descending) and take the oldest:
df.orderBy(col("age").desc).limit(1).show
(equivalent SQL: select * from t order by age desc limit 1)
4.Age > 25
df.filter(("age")>25).show
Attention: this does not compile, because a plain string has no > operator. We must wrap the name in col(...) so Spark knows it is a column:
df.filter(col("age")>25).show
5.Count how many people share each age:
df.groupBy(col("age")).count().show()
6.Select id, name, and age plus 1:
df.select(col("id"),col("name"),col("age")+1).show
四.SQL style syntax
First, we need to register the DataFrame as a temp table:
val df=studentRDD.toDF
df.registerTempTable("t_student")
1.Select the two oldest students:
sqlContext.sql("select * from t_student order by age desc limit 2").show
2.Show the table schema:
sqlContext.sql("desc t_student").show
五.Code
StructTypeSchema:
package SparkSql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object StructTypeSchema {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StructTypeSchema").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Load the raw data from the given path
    val studentRDD = sc.textFile("d://student.txt").map(_.split(","))
    // Define the schema explicitly with StructType
    val schema = StructType(
      List(
        StructField("id", IntegerType, true), // the third argument is nullable
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    // Map each array of fields to a Row
    val rowRDD = studentRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    // Apply the schema to the Row RDD
    val studentDF = sqlContext.createDataFrame(rowRDD, schema)
    // Register it as a temp table
    studentDF.registerTempTable("student")
    // Run SQL: find the four youngest students
    val df = sqlContext.sql("select * from student order by age limit 4")
    //df.write.json(args(1)) // alternative: write JSON to the path given in args(1)
    df.save("d://03241506OUT", "json") // deprecated since Spark 1.4; prefer df.write.json(path)
    sc.stop()
  }
}
LoadDemo:
package SparkSql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object LoadDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LoadDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // read.json loads the files directly as a DataFrame
    val df = sqlContext.read.json("d://03241506OUT")
    df.registerTempTable("t")
    sqlContext.sql("select * from t order by id desc limit 1").show()
    sc.stop()
  }
}
Spark SQL with MySQL:
I recommend the following blog:
http://www.cnblogs.com/wujiadong2014/p/6516598.html
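Note: the MySQL examples below assume the MySQL JDBC driver is on the classpath; a typical sbt dependency is shown here (the artifact version is an assumption, match it to your MySQL server):
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.38" // assumed version; any 5.1.x works with com.mysql.jdbc.Driver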
MysqlSave:
package SparkSql

import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object MysqlSave {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MysqlSave").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val studentRDD = sc.parallelize(Array("8 sb 100")).map(_.split(" "))
    // Define the schema to apply to the RDD
    val schema = StructType(
      List(
        StructField("id", IntegerType, true), // the third argument is nullable
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )
    // Map the RDD to an RDD[Row]
    val rowRDD = studentRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    // Turn it into a DataFrame
    val studentDF = sqlContext.createDataFrame(rowRDD, schema)
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "root")
    // Append the rows to the database table
    studentDF.write.mode("append").jdbc("jdbc:mysql://192.168.16.100:3306/sparkSql", "sparkSql.student", prop)
    sc.stop()
  }
}
MySqlLoad:
package SparkSql

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MySqlLoad {
  def main(args: Array[String]): Unit = {
    LoggerLevels.setStreamingLogLevels() // project helper that quiets log output
    val conf = new SparkConf().setAppName("MySqlLoad").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val jdbcDF = sqlContext.read.format("jdbc").options(Map(
      "url" -> "jdbc:mysql://192.168.16.100:3306/sparkSql",
      "driver" -> "com.mysql.jdbc.Driver",
      "dbtable" -> "student",
      "user" -> "root",
      "password" -> "root")).load()
    jdbcDF.show()
    // Find the oldest student in the student table
    jdbcDF.registerTempTable("stu")
    val res = sqlContext.sql("select * from stu order by age desc limit 1")
    res.show()
    // res.write.json("d://res")
    // val loadDF = sqlContext.read.load("d://res")
    sc.stop()
  }
}
JdbcRDD:
package SparkSql

import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDD {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JdbcRDD").setMaster("local[2]")
    val sc = new SparkContext(conf)
    def getConnection() = {
      // Load the MySQL driver and open a connection
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://192.168.16.100:3306/sparkSql", "root", "root")
    }
    val jdbcRDD = new JdbcRDD(
      sc,
      getConnection,
      "select * from student where id >= ? and id <= ?",
      // lowerBound = 2, upperBound = 5, numPartitions = 3: the range 2 <= id <= 5
      // is split into 3 partitions, each binding its sub-range to the two ? placeholders
      2, 5, 3,
      r => {
        // Map each ResultSet row to an (id, age) tuple
        val id = r.getInt(1)
        val age = r.getInt(3)
        (id, age)
      }
    )
    val jrdd = jdbcRDD.collect()
    println(jrdd.toBuffer)
    sc.stop()
  }
}