对Titanic公开数据集进行缺失值统计




/*
参考资料:
http://spark.apache.org/docs/2.3.2/sql-programming-guide.html
https://blog.csdn.net/dreamer2020/article/details/51284789
https://blog.csdn.net/supersalome/article/details/78849581



*/


import org.apache.spark.ml.feature.{Imputer, StandardScaler}
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}


val titanicDFCsv  = (spark.read.format("csv")
  .option("sep", ",")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("/titanic_data/train.csv"))
  
/*

scala> titanicDFCsv.printSchema
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

*/  
  
  
//统计Cabin取值分布情况

/*

scala> titanicDFCsv.groupBy("Cabin").agg(count("Cabin") as "cnts").sort(desc("cnts")).show()
+-----------+----+
|      Cabin|cnts|
+-----------+----+
|         G6|   4|
|C23 C25 C27|   4|
|    B96 B98|   4|
|    C22 C26|   3|
|          D|   3|
|       E101|   3|
|         F2|   3|
|        F33|   3|
|        B22|   2|
|        C65|   2|
|        B18|   2|
|         E8|   2|
|        D35|   2|
|       C126|   2|
|B51 B53 B55|   2|
|        B20|   2|
|      F G73|   2|
|        C78|   2|
|    B58 B60|   2|
|        C83|   2|
+-----------+----+
only showing top 20 rows

*/

titanicDFCsv.filter("Cabin is null").select("Cabin").show()
/*
scala> titanicDFCsv.filter("Cabin is null").select("Cabin").count()
res7: Long = 687
*/

//统计Cabin字段缺失值个数
/*
scala> titanicDFCsv.filter($"Cabin".isNull).count()
res46: Long = 687
*/

scala> titanicDFCsv.filter($"Cabin".isNull).show
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736|30.0708| null|       C|
|         13|       0|     3|Saundercock, Mr. ...|  male|20.0|    0|    0|       A/5. 2151|   8.05| null|       S|
|         14|       0|     3|Andersson, Mr. An...|  male|39.0|    1|    5|          347082| 31.275| null|       S|
|         15|       0|     3|Vestrom, Miss. Hu...|female|14.0|    0|    0|          350406| 7.8542| null|       S|
|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|          248706|   16.0| null|       S|
|         17|       0|     3|Rice, Master. Eugene|  male| 2.0|    4|    1|          382652| 29.125| null|       Q|
|         18|       1|     2|Williams, Mr. Cha...|  male|null|    0|    0|          244373|   13.0| null|       S|
|         19|       0|     3|Vander Planke, Mr...|female|31.0|    1|    0|          345763|   18.0| null|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|            2649|  7.225| null|       C|
|         21|       0|     2|Fynney, Mr. Joseph J|  male|35.0|    0|    0|          239865|   26.0| null|       S|
|         23|       1|     3|"McGowan, Miss. A...|female|15.0|    0|    0|          330923| 8.0292| null|       Q|
|         25|       0|     3|Palsson, Miss. To...|female| 8.0|    3|    1|          349909| 21.075| null|       S|
|         26|       1|     3|Asplund, Mrs. Car...|female|38.0|    1|    5|          347077|31.3875| null|       S|
|         27|       0|     3|Emir, Mr. Farred ...|  male|null|    0|    0|            2631|  7.225| null|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 20 rows





titanicDFCsv.select(titanicDFCsv.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*).show
/*

scala> titanicDFCsv.select(titanicDFCsv.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*).show
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
*/

//对于数值型变量,既可以使用$"Age" > 21, 也可以使用"Age > 21"
titanicDFCsv.filter($"Age" > 21).show()
/*
scala> titanicDFCsv.filter("Age > 21").show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|          113783|  26.55| C103|       S|
|         14|       0|     3|Andersson, Mr. An...|  male|39.0|    1|    5|          347082| 31.275| null|       S|
|         16|       1|     2|Hewlett, Mrs. (Ma...|female|55.0|    0|    0|          248706|   16.0| null|       S|
|         19|       0|     3|Vander Planke, Mr...|female|31.0|    1|    0|          345763|   18.0| null|       S|
|         21|       0|     2|Fynney, Mr. Joseph J|  male|35.0|    0|    0|          239865|   26.0| null|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|    0|    0|          248698|   13.0|  D56|       S|
|         24|       1|     1|Sloper, Mr. Willi...|  male|28.0|    0|    0|          113788|   35.5|   A6|       S|
|         26|       1|     3|Asplund, Mrs. Car...|female|38.0|    1|    5|          347077|31.3875| null|       S|
|         31|       0|     1|Uruchurtu, Don. M...|  male|40.0|    0|    0|        PC 17601|27.7208| null|       C|
|         34|       0|     2|Wheadon, Mr. Edwa...|  male|66.0|    0|    0|      C.A. 24579|   10.5| null|       S|
|         35|       0|     1|Meyer, Mr. Edgar ...|  male|28.0|    1|    0|        PC 17604|82.1708| null|       C|
|         36|       0|     1|Holverson, Mr. Al...|  male|42.0|    1|    0|          113789|   52.0| null|       S|
|         41|       0|     3|Ahlin, Mrs. Johan...|female|40.0|    1|    0|            7546|  9.475| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 20 rows

*/


//此处的等号有2个,注意跟"||"、&&结合使用时候等号的差别(有三个等号)
/*
scala> titanicDFCsv.filter("Age == 21").show()
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|           Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
|         38|       0|     3|Cann, Mr. Ernest ...|  male|21.0|    0|    0|       A./5. 2152|   8.05| null|       S|
|         52|       0|     3|Nosworthy, Mr. Ri...|  male|21.0|    0|    0|       A/4. 39886|    7.8| null|       S|
|         57|       1|     2|   Rugg, Miss. Emily|female|21.0|    0|    0|       C.A. 31026|   10.5| null|       S|
|         73|       0|     2|Hood, Mr. Ambrose Jr|  male|21.0|    0|    0|     S.O.C. 14879|   73.5| null|       S|
|        103|       0|     1|White, Mr. Richar...|  male|21.0|    0|    1|            35281|77.2875|  D26|       S|
|        107|       1|     3|Salkjelsvik, Miss...|female|21.0|    0|    0|           343120|   7.65| null|       S|
|        116|       0|     3|Pekoniemi, Mr. Ed...|  male|21.0|    0|    0|STON/O 2. 3101294|  7.925| null|       S|
|        121|       0|     2|Hickman, Mr. Stan...|  male|21.0|    2|    0|     S.O.C. 14879|   73.5| null|       S|
|        174|       0|     3|Sivola, Mr. Antti...|  male|21.0|    0|    0|STON/O 2. 3101280|  7.925| null|       S|
|        392|       1|     3|Jansson, Mr. Carl...|  male|21.0|    0|    0|           350034| 7.7958| null|       S|
|        403|       0|     3|Jussila, Miss. Ma...|female|21.0|    1|    0|             4137|  9.825| null|       S|
|        409|       0|     3|Birkeland, Mr. Ha...|  male|21.0|    0|    0|           312992|  7.775| null|       S|
|        422|       0|     3| Charters, Mr. David|  male|21.0|    0|    0|       A/5. 13032| 7.7333| null|       Q|
|        437|       0|     3|"Ford, Miss. Dool...|female|21.0|    2|    2|       W./C. 6608| 34.375| null|       S|
|        492|       0|     3| Windelov, Mr. Einar|  male|21.0|    0|    0| SOTON/OQ 3101317|   7.25| null|       S|
|        495|       0|     3|Stanley, Mr. Edwa...|  male|21.0|    0|    0|        A/4 45380|   8.05| null|       S|
|        502|       0|     3| Canavan, Miss. Mary|female|21.0|    0|    0|           364846|   7.75| null|       Q|
|        624|       0|     3|Hansen, Mr. Henry...|  male|21.0|    0|    0|           350029| 7.8542| null|       S|
|        625|       0|     3|"Bowen, Mr. David...|  male|21.0|    0|    0|            54636|   16.1| null|       S|
|        628|       1|     1|Longley, Miss. Gr...|female|21.0|    0|    0|            13502|77.9583|   D9|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
only showing top 20 rows
*/





//对字符型变量进行过滤
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+
|        124|       1|     2| Webber, Miss. Susan|female|32.5|    0|    0| 27267| 13.0| E101|       S|
|        304|       1|     2| Keane, Miss. Nora A|female|null|    0|    0|226593|12.35| E101|       Q|
|        718|       1|     2|"Troutt, Miss. Ed...|female|27.0|    0|    0| 34218| 10.5| E101|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+


//逻辑连接符 &&(并)、||(或)
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")&&$"Age">21).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|        124|       1|     2| Webber, Miss. Susan|female|32.5|    0|    0| 27267|13.0| E101|       S|
|        718|       1|     2|"Troutt, Miss. Ed...|female|27.0|    0|    0| 34218|10.5| E101|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
*/


//统计Cabin取值为"E101"且Age取值为27的记录
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")&&$"Age"===27.0).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|        718|       1|     2|"Troutt, Miss. Ed...|female|27.0|    0|    0| 34218|10.5| E101|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
*/


//统计Cabin取值为"E101"或Age取值为27的记录,注意数值型变量有三个等号
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")||$"Age"===27.0).show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| null|       S|
|         42|       0|     2|Turpin, Mrs. Will...|female|27.0|    1|    0|           11668|   21.0| null|       S|
|        124|       1|     2| Webber, Miss. Susan|female|32.5|    0|    0|           27267|   13.0| E101|       S|
|        147|       1|     3|"Andersson, Mr. A...|  male|27.0|    0|    0|          350043| 7.7958| null|       S|
|        217|       1|     3|Honkanen, Miss. E...|female|27.0|    0|    0|STON/O2. 3101283|  7.925| null|       S|
|        222|       0|     2|Bracken, Mr. James H|  male|27.0|    0|    0|          220367|   13.0| null|       S|
|        304|       1|     2| Keane, Miss. Nora A|female|null|    0|    0|          226593|  12.35| E101|       Q|
|        322|       0|     3|    Danoff, Mr. Yoto|  male|27.0|    0|    0|          349219| 7.8958| null|       S|
|        378|       0|     1|Widener, Mr. Harr...|  male|27.0|    0|    2|          113503|  211.5|  C82|       C|
|        552|       0|     2|Sharp, Mr. Perciv...|  male|27.0|    0|    0|          244358|   26.0| null|       S|
|        608|       1|     1|Daniel, Mr. Rober...|  male|27.0|    0|    0|          113804|   30.5| null|       S|
|        621|       0|     3| Yasbeck, Mr. Antoni|  male|27.0|    1|    0|            2659|14.4542| null|       C|
|        682|       1|     1|  Hassab, Mr. Hammad|  male|27.0|    0|    0|        PC 17572|76.7292|  D49|       C|
|        718|       1|     2|"Troutt, Miss. Ed...|female|27.0|    0|    0|           34218|   10.5| E101|       S|
|        725|       1|     1|Chambers, Mr. Nor...|  male|27.0|    1|    0|          113806|   53.1|   E8|       S|
|        805|       1|     3|Hedman, Mr. Oskar...|  male|27.0|    0|    0|          347089|  6.975| null|       S|
|        822|       1|     3|   Lulic, Mr. Nikola|  male|27.0|    0|    0|          315098| 8.6625| null|       S|
|        824|       1|     3|  Moor, Mrs. (Beila)|female|27.0|    0|    1|          392096| 12.475| E121|       S|
|        867|       1|     2|Duran y More, Mis...|female|27.0|    1|    0|   SC/PARIS 2149|13.8583| null|       C|
|        887|       0|     2|Montvila, Rev. Ju...|  male|27.0|    0|    0|          211536|   13.0| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
*/


//统计Cabin取值为空或Age取值为空的记录
/*
scala> titanicDFCsv.filter($"Cabin".isNull||$"Age".isNull).count()
res47: Long = 706
*/

转载于:https://my.oschina.net/kyo4321/blog/2994649

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值