Spark2 DataFrame数据框常用操作


  1. Spark session available as 'spark'.
  2. Welcome to Spark version 2.0.1

  3. Using Scala version 2.11.8

  4. import org.apache.spark.sql.SparkSession
  5. import org.apache.spark.sql.DataFrame
  6. import org.apache.spark.rdd.RDD
  7. import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
  8. import org.apache.spark.sql.Encoder

  9. scala>

  10. scala> val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
  11. 16/11/05 15:40:31 WARN SparkSession$Builder: Use an existing SparkSession, some configuration may not take effect.
  12. spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@68d97cfb

  13. scala>

  14. scala> // For implicit conversions like converting RDDs to DataFrames
  15. scala> import spark.implicits._


  16. scala>

  17. scala> // 创建数据框

  18. scala> // val data1:DataFrame=spark.read.csv("hdfs://ns1/datafile/wangxiao/Affairs.csv")

  19. scala>

  20. scala> val data1: DataFrame = spark.read.option("header", true).format("csv").load("hdfs://ns1/datafile/wangxiao/Affairs.csv")
  21. data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  22. scala>

  23. scala> data1.printSchema()
  24. root
  25.  |-- affairs: string (nullable = true)
  26.  |-- gender: string (nullable = true)
  27.  |-- age: string (nullable = true)
  28.  |-- yearsmarried: string (nullable = true)
  29.  |-- children: string (nullable = true)
  30.  |-- religiousness: string (nullable = true)
  31.  |-- education: string (nullable = true)
  32.  |-- occupation: string (nullable = true)
  33.  |-- rating: string (nullable = true)


  34. scala>

  35. scala> data1.limit(10).show



  36. scala>

  37. scala> //##############################################

  38. scala> // 转换数据类型:将字符串列转换为 Double

  39. scala> val res1 = data1.select(
  40.      | data1("affairs").cast("Double"),
  41.      | data1("age").cast("Double"),
  42.      | data1("yearsmarried").cast("Double"),
  43.      | data1("religiousness").cast("Double"),
  44.      | data1("education").cast("Double"),
  45.      | data1("occupation").cast("Double"),
  46.      | data1("rating").cast("Double"),
  47.      | data1("gender").cast("String"),
  48.      | data1("children").cast("String"))
  49. res1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

  50. scala>

  51. scala> res1.printSchema()
  52. root
  53.  |-- affairs: double (nullable = true)
  54.  |-- age: double (nullable = true)
  55.  |-- yearsmarried: double (nullable = true)
  56.  |-- religiousness: double (nullable = true)
  57.  |-- education: double (nullable = true)
  58.  |-- occupation: double (nullable = true)
  59.  |-- rating: double (nullable = true)
  60.  |-- gender: string (nullable = true)
  61.  |-- children: string (nullable = true)


  62. scala>

  63. scala> //################################################

  64. scala> //创建RDD

  65. scala> val data2: RDD[String] = spark.sparkContext.textFile("hdfs://ns1/datafile/wangxiao/Affairs.txt")

  66. scala>

  67. scala> case class Affairs1(affairs: Int, gender: String, age: Int,
  68.      | yearsmarried: Double, children: String, religiousness: Int,
  69.      | education: Double, occupation: Double, rating: Int)
  70. defined class Affairs1

  71. scala>

  72. scala> // RDD转换成数据框

  73. scala> val res2 = data2.map { _.split(" ") }.map { line =>
  74.      | Affairs1(line(0).toInt, line(1).trim.toString(), line(2).toInt,
  75.      | line(3).toDouble, line(4).trim.toString(), line(5).toInt,
  76.      | line(6).toDouble, line(7).toDouble, line(8).toInt)
  77.      | }.toDF()
  78. res2: org.apache.spark.sql.DataFrame = [affairs: int, gender: string ... 7 more fields]

  79. scala>

  80. scala> res2.printSchema()
  81. root
  82.  |-- affairs: integer (nullable = false)
  83.  |-- gender: string (nullable = true)
  84.  |-- age: integer (nullable = false)
  85.  |-- yearsmarried: double (nullable = false)
  86.  |-- children: string (nullable = true)
  87.  |-- religiousness: integer (nullable = false)
  88.  |-- education: double (nullable = false)
  89.  |-- occupation: double (nullable = false)
  90.  |-- rating: integer (nullable = false)


  91. scala>

  92. scala> //###############################################

  93. scala> // 创建视图

  94. scala> res1.createOrReplaceTempView("Affairs")

  95. scala>

  96. scala> // 子查询

  97. scala> //val df1 = spark.sql("SELECT * FROM Affairs WHERE age BETWEEN 20 AND 25")

  98. scala> val df1 = spark.sql("select gender, age,rating from ( SELECT * FROM Affairs WHERE age BETWEEN 20 AND 25 ) t ")
  99. df1: org.apache.spark.sql.DataFrame = [gender: string, age: double ... 1 more field]

  100. scala>

  101. scala> df1.limit(10).show
  102. +------+----+------+
  103. |gender| age|rating|
  104. +------+----+------+
  105. |  male|22.0|   3.0|
  106. |female|22.0|   3.0|
  107. |  male|22.0|   5.0|
  108. |female|22.0|   4.0|
  109. |female|22.0|   4.0|
  110. |female|22.0|   5.0|
  111. |female|22.0|   5.0|
  112. |female|22.0|   5.0|
  113. |female|22.0|   5.0|
  114. |female|22.0|   5.0|
  115. +------+----+------+


  116. scala>

  117. scala> // 保存数据框到文件

  118. scala> data1.select("gender", "age", "education").write.format("csv").save("hdfs://ns1/datafile/wangxiao/data123.csv")

来自“ITPUB博客”,链接:http://blog.itpub.net/29070860/viewspace-2127854/ ,如需转载,请注明出处,否则将追究法律责任。

转载于:http://blog.itpub.net/29070860/viewspace-2127854/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值