Dataframe基本操作
主要记录学习 DataFrame 时的一些简单操作,代码如下:
import org.apache.spark.sql.SparkSession
/**
* DataFrame 基本操作
*/
object DataFrameApp {
  def main(args: Array[String]): Unit = {
    // Local session with two worker threads.
    val session = SparkSession.builder()
      .appName("DataFrameApp")
      .master("local[2]")
      .getOrCreate()

    val people = session.read
      .format("json")
      .load("file:///usr/local/spark/examples/src/main/resources/people.json")

    // Print the schema inferred from the JSON input.
    people.printSchema()
    // Show the first 20 rows (the default); a row count may also be passed.
    people.show()
    // Project one column: select name from table
    people.select("name").show()
    // Project with a computed, aliased column:
    //   select name, age + 10 as `new age` from table
    people.select(people.col("name"), (people.col("age") + 10).as("new age")).show()
    // Row filter: select * from table where age > 19
    people.filter(people.col("age") > 19).show()
    // Group then aggregate: select age, count(1) from table group by age
    people.groupBy("age").count().show()

    session.stop()
  }
}
进阶操作
DataFrame与RDD互相操作—反射方式
import org.apache.spark.sql.SparkSession
/**
* DataFrame与RDD互相操作实现---反射实现
*/
object DataFrameRDDApp {

  /** One record of infos.txt; each input line is "id,name,age". */
  case class Info(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("DataFrameRDDApp")
      .master("local[2]")
      .getOrCreate()

    // RDD => DataFrame: parse each CSV line into an Info case class,
    // letting reflection derive the schema.
    val lines = session.sparkContext.textFile("/home/hadoop/infos.txt")

    // NOTE: the implicit conversions must be in scope for .toDF to resolve.
    import session.implicits._

    val infoDF = lines
      .map(_.split(","))
      .map(fields => Info(fields(0).toInt, fields(1), fields(2).toInt))
      .toDF
    infoDF.show()

    // Query through the DataFrame API.
    infoDF.filter(infoDF.col("age") > 30).show()

    // Same query through SQL, after registering a temporary view named "info".
    infoDF.createOrReplaceTempView("info")
    session.sql("select * from info where age > 30").show()

    session.stop()
  }
}
DataFrame与RDD互相操作—编程方式
/**
 * RDD => DataFrame via the programmatic approach: build an RDD of generic
 * Rows and attach an explicitly constructed StructType schema.
 */
def program(spark: SparkSession): Unit = {
  // Read the raw CSV lines and turn each into a Row of (Int, String, Int).
  val rowRDD = spark.sparkContext
    .textFile("/home/hadoop/infos.txt")
    .map(_.split(","))
    .map(fields => Row(fields(0).toInt, fields(1), fields(2).toInt))

  // Hand-built schema; all three columns are nullable.
  val schema = StructType(Array(
    StructField("id", IntegerType, nullable = true),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)))

  val infoDF = spark.createDataFrame(rowRDD, schema)
  infoDF.printSchema()
  infoDF.show()
}
参考:
http://spark.apache.org/docs/latest/sql-programming-guide.html#datasets-and-dataframes