people.json
{"id":1, "name":"Ganymede", "age":32} {"id":2, "name":"Lilei", "age":19} {"id":3, "name":"Lily", "age":25} {"id":4, "name":"Hanmeimei", "age":25} {"id":5, "name":"Lucy", "age":37} {"id":6, "name":"Tom", "age":27}
price.json
{"errNo":"0","content": []}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}
object DataFrameTest {

  /**
   * Demo of Spark SQL DataFrame basics: reading JSON files, inner/left joins,
   * schema inspection, filtering, column arithmetic, grouping, and row traversal.
   *
   * Input files (see sample data above): people.json (id, name, age) and
   * price.json (errNo, content).
   */
  def main(args: Array[String]): Unit = {
    // Reduce Spark's console noise to WARN and above.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    // Initialization: local-mode SparkContext + SQLContext.
    val conf = new SparkConf().setAppName("DataFrameTest").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc) // fixed typo: was "sqlConetext"

    // Forward slashes are portable on both Windows and Unix JVMs,
    // unlike the original escaped backslashes.
    val df = sqlContext.read.json("src/main/resource/json/people.json")
    val dfPrice = sqlContext.read.json("src/main/resource/json/price.json")

    // Show the contents of df.
    df.show()

    // Inner join of df with dfPrice.
    // NOTE(review): joining people "id" against price "errNo" looks like
    // mismatched keys — confirm this is the intended join condition.
    val dfJoin: DataFrame = dfPrice.join(df, df("id") === dfPrice("errNo"))
    dfJoin.show()

    // Left join. joinType may be "inner", "left", "right" or "full",
    // corresponding to SQL inner/left/right/full joins; the default is "inner".
    val dfLeftJoin = dfPrice.join(df, df("id") === dfPrice("errNo"), "left")
    dfLeftJoin.show()

    // Print the inferred schema.
    df.printSchema()

    // Filter on a column value (gt means "greater than").
    df.filter(df.col("age").gt(32)).show()

    // Select multiple columns; plus(1) adds 1 to each age value.
    df.select(df.col("name"), df.col("age").plus(1)).show()

    // Count rows grouped by the age column.
    df.groupBy(df.col("age")).count().show()

    // Traverse rows with a lambda. In local mode this prints to the console;
    // on a real cluster foreach runs on the executors, not the driver.
    df.select(df.col("name"), df.col("age"), df.col("id")).foreach(x => println(x))

    // Release cluster resources — the original never stopped the SparkContext.
    sc.stop()
  }
}
初级阶段 (beginner level)