Spark2 DataFrame 数据框空值(null、NaN)的查找和处理


  1. import org.apache.spark.sql.SparkSession
  2. import org.apache.spark.sql.Dataset
  3. import org.apache.spark.sql.Row
  4. import org.apache.spark.sql.DataFrame
  5. import org.apache.spark.sql.DataFrameReader
  6. import org.apache.spark.rdd.RDD
  7. import scala.math._

  8. val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

  9. // For implicit conversions like converting RDDs to DataFrames
  10. import spark.implicits._

  11. scala> val data: DataFrame = spark.read.format("csv").option("header", false).load("hdfs://ns1/datafile/wangxiao/AffairsNA.csv")
  12. data: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 7 more fields]



  13. scala> val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
  14. data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  15. scala> data1.limit(10).show




  删除任意列中含有空值或 NaN 的整行(不带参数的 na.drop 对所有列生效):

  16. scala> val resNull=data1.na.drop()
  17. resNull: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  18. scala> resNull.limit(10).show()



  只针对指定列(这里是 yearsmarried)删除含空值的行:

  19. scala> val res=data1.select("yearsmarried").na.drop()
  20. res: org.apache.spark.sql.DataFrame = [yearsmarried: string]

  21. scala> res.limit(10).show()
  22. +------------+
  23. |yearsmarried|
  24. +------------+
  25. | 10|
  26. | 15|
  27. | 15|
  28. | 1.5|
  29. | 15|
  30. | 4|
  31. | 15|
  32. | 1.5|
  33. | 4|
  34. | 15|
  35. +------------+



  将所有列中的空值统一填充为同一个指定字符串:

  36. scala> val res123=data1.na.fill("wangxiao123")
  37. res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  38. scala> res123.limit(10).show()





  只对指定的若干列(gender、yearsmarried)填充同一个值,value 和 cols 为命名参数:

  39. scala> val res2=data1.na.fill(value="wangxiao111",cols=Array("gender","yearsmried") )
  40. res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  41. scala> res2.limit(10).show()
  42. +-------+-----------+---+------------+--------+-------------+---------+----------+------+
  43. |affairs| gender|age|yearsmarried|children|religiousness|education|occupation|rating|
  44. +-------+-----------+---+------------+--------+-------------+---------+----------+------+
  45. | 0| male| 37| 10| no| 3| 18| 7| 4|
  46. | 0|wangxiao111| 27| wangxiao111| no| 4| 14| 6| null|
  47. | 0|wangxiao111| 32| wangxiao111| yes| 1| 12| 1| null|
  48. | 0|wangxiao111| 57| wangxiao111| yes| 5| 18| 6| null|
  49. | 0|wangxiao111| 22| wangxiao111| no| 2| 17| 6| null|
  50. | 0|wangxiao111| 32| wangxiao111| no| 2| 17| 5| null|
  51. | 0| female| 22| wangxiao111| no| 2| 12| 1| null|
  52. | 0| male| 57| 15| yes| 2| 14| 4| 4|
  53. | 0| female| 32| 15| yes| 4| 16| 1| 2|
  54. | 0| male| 22| 1.5| no| 4| 14| 4| 5|
  55. +-------+-----------+---+------------+--------+-------------+---------+----------+------+



  用 Map 对不同的列分别填充不同的值:

  56. scala> val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") )
  57. res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

  58. scala> res3.limit(10).show()
  59. +-------+-----------+---+------------+--------+-------------+---------+----------+------+
  60. |affairs| gender|age|yearsmarried|children|religiousness|education|occupation|rating|
  61. +-------+-----------+---+------------+--------+-------------+---------+----------+------+
  62. | 0| male| 37| 10| no| 3| 18| 7| 4|
  63. | 0|wangxiao222| 27| wangxiao567| no| 4| 14| 6| null|
  64. | 0|wangxiao222| 32| wangxiao567| yes| 1| 12| 1| null|
  65. | 0|wangxiao222| 57| wangxiao567| yes| 5| 18| 6| null|
  66. | 0|wangxiao222| 22| wangxiao567| no| 2| 17| 6| null|
  67. | 0|wangxiao222| 32| wangxiao567| no| 2| 17| 5| null|
  68. | 0| female| 22| wangxiao567| no| 2| 12| 1| null|
  69. | 0| male| 57| 15| yes| 2| 14| 4| 4|
  70. | 0| female| 32| 15| yes| 4| 16| 1| 2|
  71. | 0| male| 22| 1.5| no| 4| 14| 4| 5|
  72. +-------+-----------+---+------------+--------+-------------+---------+----------+------+



  用 SQL 表达式("gender is null")过滤出 gender 列为空值的行:

  73. scala> data1.filter("gender is null").select("gender").limit(10).show
  74. +------+
  75. |gender|
  76. +------+
  77. | null|
  78. | null|
  79. | null|
  80. | null|
  81. | null|
  82. +------+


  83. scala> data1.filter("gender is not null").select("gender").limit(10).show
  84. +------+
  85. |gender|
  86. +------+
  87. | male|
  88. |female|
  89. | male|
  90. |female|
  91. | male|
  92. | male|
  93. | male|
  94. | male|
  95. |female|
  96. |female|
  97. +------+


  98. scala> data1.filter( data1("gender").isNull ).select("gender").limit(10).show
  99. +------+
  100. |gender|
  101. +------+
  102. | null|
  103. | null|
  104. | null|
  105. | null|
  106. | null|
  107. +------+


  108. scala> data1.filter("gender<>''").select("gender").limit(10).show
  109. +------+
  110. |gender|
  111. +------+
  112. | male|
  113. |female|
  114. | male|
  115. |female|
  116. | male|
  117. | male|
  118. | male|
  119. | male|
  120. |female|
  121. |female|
  122. +------+



  Scala 中 NaN 的产生与判断(负数开平方得到 NaN,用 isNaN 判断):

  123. scala> math.sqrt(-1.0)
  124. res21: Double = NaN

  125. scala> math.sqrt(-1.0).isNaN()
  126. res22: Boolean = true

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/29070860/viewspace-2127858/,如需转载,请注明出处,否则将追究法律责任。

转载于:http://blog.itpub.net/29070860/viewspace-2127858/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值