// 1. SparkSQL
// SparkSQL study notes (SparkSQL学习手册)
package SparkSql
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
object sparksql_test {
  /**
   * Walk-through of basic Spark SQL (1.6-era SQLContext) operations:
   * reading JSON into a DataFrame, simple DataFrame transformations,
   * deriving a schema from a JavaBean, and Parquet load/save.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("sparksqltest")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)

      // JSON data source.
      val df = sqlContext.read.json("F:\\研究院\\人才工作站\\安装软件\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.json")
      df.show()                                   // print as a structured table
      df.printSchema()                            // print the schema as a tree
      df.select("name").show()                    // project the name column
      df.select(df("name"), df("age") + 1).show() // name, and age incremented by 1
      df.filter(df("age") > 21).show()            // only rows where age exceeds 21
      df.groupBy("age").count().show()            // group by age and count each group

      // Map each text line to a JavaBean so Spark can infer the schema by reflection.
      val people = sc.textFile("F:\\研究院\\人才工作站\\安装软件\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.txt").map { line =>
        val parts = line.split(',')
        val person = new Person()
        person.setName(parts(0))
        // trim: the standard people.txt fixture has a space after the comma
        // ("Andy, 30"), and "` 30`".toInt would throw NumberFormatException.
        person.setAge(parts(1).trim.toInt)
        person
      }
      // registerTempTable returns Unit, so there is nothing to bind to a val.
      sqlContext.createDataFrame(people, classOf[Person]).registerTempTable("people")
      // foreach is an action returning Unit; run it for its printing side effect.
      sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19").foreach(row => println(row))

      // Parquet is Spark SQL's default data source, so no explicit format is needed.
      val usersDf = sqlContext.read.load("F:\\研究院\\人才工作站\\安装软件\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\users.parquet")
      usersDf.select("name", "favorite_color").write.mode(SaveMode.ErrorIfExists).save("namesAndFavColors.parquet")

      // For any other format, name it explicitly.
      val peopleDf = sqlContext.read.format("json").load("examples/src/main/resources/people.json")
      peopleDf.select("name", "age").write.format("parquet").mode(SaveMode.ErrorIfExists).save("namesAndAges.parquet")
    } finally {
      // Always release the local SparkContext, even if a step above throws.
      sc.stop()
    }
  }
}
package SparkSql;
import java.io.Serializable;
/**
 * Simple serializable JavaBean used by Spark SQL's bean-reflection schema
 * inference (see createDataFrame(rdd, classOf[Person]) in the Scala driver).
 * Spark requires a public no-arg constructor plus getter/setter pairs.
 */
public class Person implements Serializable {
    // Explicit serialVersionUID: a Serializable class without one gets a
    // compiler-generated UID that changes with any edit, silently breaking
    // deserialization compatibility across versions.
    private static final long serialVersionUID = 1L;

    private String name;
    private int age;

    /** @return the person's name (may be null before setName is called) */
    public String getName() {
        return name;
    }

    /** @param name the person's name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the person's age in years */
    public int getAge() {
        return age;
    }

    /** @param age the person's age in years */
    public void setAge(int age) {
        this.age = age;
    }

    /** Human-readable form for logs and debugging. */
    @Override
    public String toString() {
        return "Person{name=" + name + ", age=" + age + "}";
    }
}