import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import scala.math._

val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._

scala> val data: DataFrame = spark.read.format("csv").option("header", false).load("hdfs://ns1/datafile/wangxiao/AffairsNA.csv")
data: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 7 more fields]
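Because the file is read without a header, every column comes back as string. A variant not in the original session, sketched here on the assumption that the same file is used: adding inferSchema (and applying the column names in the same step) yields typed columns, which matters later for numeric fills and NaN checks.

val typed = spark.read.format("csv")
  .option("header", false)
  .option("inferSchema", true)  // sample the file and guess each column's type
  .load("hdfs://ns1/datafile/wangxiao/AffairsNA.csv")
  .toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
typed.printSchema()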
scala> val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala> data1.limit(10).show
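toDF replaces all nine column names positionally, so the number of names must match the number of columns. To rename a single column instead, withColumnRenamed can be used; a small aside, not part of the original session:

val renamed = data.withColumnRenamed("_c0", "affairs")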
scala> val resNull=data1.na.drop()
resNull: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala> resNull.limit(10).show()
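na.drop() with no arguments removes every row containing at least one null (or NaN in numeric columns). The same method has a few useful variants; a quick sketch, assuming data1 from above:

data1.na.drop("all")                           // drop only rows where every column is null
data1.na.drop(minNonNulls = 7)                 // keep rows with at least 7 non-null values
data1.na.drop(Seq("gender", "yearsmarried"))   // drop rows with a null in either listed column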
scala> val res=data1.select("yearsmarried").na.drop()
res: org.apache.spark.sql.DataFrame = [yearsmarried: string]

scala> res.limit(10).show()
+------------+
|yearsmarried|
+------------+
|          10|
|          15|
|          15|
|         1.5|
|          15|
|           4|
|          15|
|         1.5|
|           4|
|          15|
+------------+
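Selecting first means the result keeps only the yearsmarried column. To drop rows that have a null in yearsmarried while keeping all nine columns, the column name can be passed to na.drop instead (a sketch):

val resAllCols = data1.na.drop(Seq("yearsmarried"))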
scala> val res123=data1.na.fill("wangxiao123")
res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala> res123.limit(10).show()
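Since every column here was loaded as string, the single string value fills nulls in all of them. na.fill is type-aware: a string value only touches string columns, and a numeric value only touches numeric columns (where it also replaces NaN). A sketch against the typed DataFrame assumed earlier:

typed.na.fill(0.0)        // fill null/NaN in numeric columns only
typed.na.fill("unknown")  // fill null in string columns only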
scala> val res2=data1.na.fill(value = "wangxiao111", cols = Array("gender", "yearsmarried"))
res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala> res2.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao111| 27| wangxiao111|      no|            4|       14|         6|  null|
|      0|wangxiao111| 32| wangxiao111|     yes|            1|       12|         1|  null|
|      0|wangxiao111| 57| wangxiao111|     yes|            5|       18|         6|  null|
|      0|wangxiao111| 22| wangxiao111|      no|            2|       17|         6|  null|
|      0|wangxiao111| 32| wangxiao111|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao111|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
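Columns not listed in cols keep their nulls; rating, for example, is still null in the rows above. A quick check (sketch):

res2.filter("rating is null").count()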
scala> val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") )
res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala> res3.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao222| 27| wangxiao567|      no|            4|       14|         6|  null|
|      0|wangxiao222| 32| wangxiao567|     yes|            1|       12|         1|  null|
|      0|wangxiao222| 57| wangxiao567|     yes|            5|       18|         6|  null|
|      0|wangxiao222| 22| wangxiao567|      no|            2|       17|         6|  null|
|      0|wangxiao222| 32| wangxiao567|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao567|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
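When the columns carry different types, the Map form can mix value types per column; a sketch against the typed DataFrame assumed earlier:

typed.na.fill(Map("age" -> 99, "gender" -> "unknown", "rating" -> 0))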
- scala> data1.filter("gender is null").select("gender").limit(10).show
- +------+
- |gender|
- +------+
- | null|
- | null|
- | null|
- | null|
- | null|
- +------+
-
-
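Only five rows are printed even though limit(10) was requested, which suggests the dataset contains just five rows with a null gender. Counting them directly (sketch):

val nullGenders = data1.filter("gender is null").count()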
- scala> data1.filter("gender is not null").select("gender").limit(10).show
- +------+
- |gender|
- +------+
- | male|
- |female|
- | male|
- |female|
- | male|
- | male|
- | male|
- | male|
- |female|
- |female|
- +------+
-
-
scala> data1.filter( data1("gender").isNull ).select("gender").limit(10).show
+------+
|gender|
+------+
|  null|
|  null|
|  null|
|  null|
|  null|
+------+
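The Column API has the matching positive test as well (sketch):

data1.filter(data1("gender").isNotNull).select("gender").limit(10).show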
- scala> data1.filter("gender<>''").select("gender").limit(10).show
- +------+
- |gender|
- +------+
- | male|
- |female|
- | male|
- |female|
- | male|
- | male|
- | male|
- | male|
- |female|
- |female|
- +------+
-
-
-
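gender <> '' is a SQL comparison, and comparing null with anything yields null, which filter treats as false; the predicate therefore drops both empty strings and nulls. To state both conditions explicitly (sketch):

data1.filter("gender is not null and gender <> ''").select("gender").limit(10).show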
scala> math.sqrt(-1.0)
res21: Double = NaN

scala> math.sqrt(-1.0).isNaN()
res22: Boolean = true
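NaN is a floating-point value rather than a null, but Spark's na functions treat it the same way in numeric columns: na.drop removes rows containing NaN and the numeric forms of na.fill replace it. A sketch against the typed DataFrame assumed earlier, assuming rating was inferred as a numeric column:

import org.apache.spark.sql.functions.isnan
typed.filter(isnan($"rating")).count()  // rows whose rating is NaN
typed.na.fill(0.0, Array("rating"))     // replace null/NaN in rating with 0.0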
From the ITPUB blog: http://blog.itpub.net/29070860/viewspace-2127858/. If reposting, please credit the source.