Spark SQL 应用示例

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
/**
 * Spark SQL exercise: loads a text file of people records and prints several
 * statistics about them (counts by gender/height, tallest people, average and
 * maximum height).
 *
 * Each input line is split on a single space and its first three fields are
 * used as `id`, `gender` and `height`.
 *
 * Usage: PeopleDataStatistics2 filePath
 */
object PeopleDataStatistics2 {
  // Comma-separated column names used to build the DataFrame schema.
  private val schemaString = "id,gender,height"

  /**
   * Program entry point.
   *
   * @param args command-line arguments; `args(0)` is the path of the input file
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      println("Usage:PeopleDataStatistics2 filePath")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("Spark Exercise:People Data Statistics 2")
    val sc = new SparkContext(conf)
    try {
      val peopleDataRDD = sc.textFile(args(0))
      val sqlCtx = new SQLContext(sc)
      // Brings the $"column" syntax (and RDD -> DataFrame conversions) into scope.
      import sqlCtx.implicits._

      // All fields are read as nullable strings first; height is cast below.
      val schemaArray = schemaString.split(",")
      val schema = StructType(
        schemaArray.map(fieldName => StructField(fieldName, StringType, nullable = true)))
      // NOTE(review): a line with fewer than three space-separated fields fails here
      // with an ArrayIndexOutOfBoundsException, as in the original code.
      val rowRDD: RDD[Row] = peopleDataRDD
        .map(_.split(" "))
        .map(fields => Row(fields(0), fields(1), fields(2)))
      val rawPeopleDF = sqlCtx.createDataFrame(rowRDD, schema)

      // Cast height to an integer once, up front. With a string column, sorting and
      // max() compare text, so e.g. "99" would rank above "180".
      // NOTE(review): assumes heights are whole numbers; values that cannot be
      // cast become null.
      val peopleDF = rawPeopleDF.select(
        rawPeopleDF("id"),
        rawPeopleDF("gender"),
        rawPeopleDF("height").cast(IntegerType).as("height"))
      peopleDF.registerTempTable("people")

      // Men taller than 180.
      val higherMale180 = sqlCtx.sql(
        "select id,gender,height from people where height > 180 and gender='M'")
      println("Men whose height are more than 180: " + higherMale180.count())
      println("<Display #1>")

      // Women taller than 170.
      val higherFemale170 = sqlCtx.sql(
        "select id,gender,height from people where height > 170 and gender='F'")
      println("Women whose height are more than 170: " + higherFemale170.count())
      println("<Display #2>")

      // Number of people per gender.
      peopleDF.groupBy(peopleDF("gender")).count().show()
      println("People Count Grouped By Gender")
      println("<Display #3>")

      // Men taller than 210 (at most 50 rows shown).
      peopleDF
        .filter(peopleDF("gender").equalTo("M"))
        .filter(peopleDF("height") > 210)
        .show(50)
      println("Men whose height is more than 210")
      println("<Display #4>")

      // The 50 tallest people, printed as "id,gender,height".
      peopleDF.sort($"height".desc).take(50).foreach(row => println(row.mkString(",")))
      println("Sorted the people by height in descend order,Show top 50 people")
      println("<Display #5>")

      // Average height of men.
      peopleDF.filter(peopleDF("gender").equalTo("M")).agg(Map("height" -> "avg")).show()
      println("The Average height for Men")
      println("<Display #6>")

      // Maximum height of women.
      peopleDF.filter(peopleDF("gender").equalTo("F")).agg("height" -> "max").show()
      println("The Max height for Women:")
      println("<Display #7>")

      println("All the statistics actions are finished on structured People data.")
    } finally {
      // Release the Spark context even if one of the jobs above fails.
      sc.stop()
    }
  }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值