/*
参考资料:
http://spark.apache.org/docs/2.3.2/sql-programming-guide.html
https://blog.csdn.net/dreamer2020/article/details/51284789
https://blog.csdn.net/supersalome/article/details/78849581
*/
import org.apache.spark.ml.feature.{Imputer, StandardScaler}
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoderEstimator}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
val titanicDFCsv = (spark.read.format("csv")
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.load("/titanic_data/train.csv"))
/*
scala> titanicDFCsv.printSchema
root
|-- PassengerId: integer (nullable = true)
|-- Survived: integer (nullable = true)
|-- Pclass: integer (nullable = true)
|-- Name: string (nullable = true)
|-- Sex: string (nullable = true)
|-- Age: double (nullable = true)
|-- SibSp: integer (nullable = true)
|-- Parch: integer (nullable = true)
|-- Ticket: string (nullable = true)
|-- Fare: double (nullable = true)
|-- Cabin: string (nullable = true)
|-- Embarked: string (nullable = true)
*/
//统计Cabin取值分布情况
/*
scala> titanicDFCsv.groupBy("Cabin").agg(count("Cabin") as "cnts").sort(desc("cnts")).show()
+-----------+----+
| Cabin|cnts|
+-----------+----+
| G6| 4|
|C23 C25 C27| 4|
| B96 B98| 4|
| C22 C26| 3|
| D| 3|
| E101| 3|
| F2| 3|
| F33| 3|
| B22| 2|
| C65| 2|
| B18| 2|
| E8| 2|
| D35| 2|
| C126| 2|
|B51 B53 B55| 2|
| B20| 2|
| F G73| 2|
| C78| 2|
| B58 B60| 2|
| C83| 2|
+-----------+----+
only showing top 20 rows
*/
titanicDFCsv.filter("Cabin is null").select("Cabin").show()
/*
scala> titanicDFCsv.filter("Cabin is null").select("Cabin").count()
res7: Long = 687
*/
//统计Cabin字段缺失值个数
/*
scala> titanicDFCsv.filter($"Cabin".isNull).count()
res46: Long = 687
*/
scala> titanicDFCsv.filter($"Cabin".isNull).show
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
| 1| 0| 3|Braund, Mr. Owen ...| male|22.0| 1| 0| A/5 21171| 7.25| null| S|
| 3| 1| 3|Heikkinen, Miss. ...|female|26.0| 0| 0|STON/O2. 3101282| 7.925| null| S|
| 5| 0| 3|Allen, Mr. Willia...| male|35.0| 0| 0| 373450| 8.05| null| S|
| 6| 0| 3| Moran, Mr. James| male|null| 0| 0| 330877| 8.4583| null| Q|
| 8| 0| 3|Palsson, Master. ...| male| 2.0| 3| 1| 349909| 21.075| null| S|
| 9| 1| 3|Johnson, Mrs. Osc...|female|27.0| 0| 2| 347742|11.1333| null| S|
| 10| 1| 2|Nasser, Mrs. Nich...|female|14.0| 1| 0| 237736|30.0708| null| C|
| 13| 0| 3|Saundercock, Mr. ...| male|20.0| 0| 0| A/5. 2151| 8.05| null| S|
| 14| 0| 3|Andersson, Mr. An...| male|39.0| 1| 5| 347082| 31.275| null| S|
| 15| 0| 3|Vestrom, Miss. Hu...|female|14.0| 0| 0| 350406| 7.8542| null| S|
| 16| 1| 2|Hewlett, Mrs. (Ma...|female|55.0| 0| 0| 248706| 16.0| null| S|
| 17| 0| 3|Rice, Master. Eugene| male| 2.0| 4| 1| 382652| 29.125| null| Q|
| 18| 1| 2|Williams, Mr. Cha...| male|null| 0| 0| 244373| 13.0| null| S|
| 19| 0| 3|Vander Planke, Mr...|female|31.0| 1| 0| 345763| 18.0| null| S|
| 20| 1| 3|Masselmani, Mrs. ...|female|null| 0| 0| 2649| 7.225| null| C|
| 21| 0| 2|Fynney, Mr. Joseph J| male|35.0| 0| 0| 239865| 26.0| null| S|
| 23| 1| 3|"McGowan, Miss. A...|female|15.0| 0| 0| 330923| 8.0292| null| Q|
| 25| 0| 3|Palsson, Miss. To...|female| 8.0| 3| 1| 349909| 21.075| null| S|
| 26| 1| 3|Asplund, Mrs. Car...|female|38.0| 1| 5| 347077|31.3875| null| S|
| 27| 0| 3|Emir, Mr. Farred ...| male|null| 0| 0| 2631| 7.225| null| C|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 20 rows
titanicDFCsv.select(titanicDFCsv.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*).show
/*
scala> titanicDFCsv.select(titanicDFCsv.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*).show
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
| 0| 0| 0| 0| 0|177| 0| 0| 0| 0| 687| 2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
*/
//对于数值型变量,既可以使用$"Age" > 21, 也可以使用"Age > 21"
titanicDFCsv.filter($"Age" > 21).show()
/*
scala> titanicDFCsv.filter("Age > 21").show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
| 1| 0| 3|Braund, Mr. Owen ...| male|22.0| 1| 0| A/5 21171| 7.25| null| S|
| 2| 1| 1|Cumings, Mrs. Joh...|female|38.0| 1| 0| PC 17599|71.2833| C85| C|
| 3| 1| 3|Heikkinen, Miss. ...|female|26.0| 0| 0|STON/O2. 3101282| 7.925| null| S|
| 4| 1| 1|Futrelle, Mrs. Ja...|female|35.0| 1| 0| 113803| 53.1| C123| S|
| 5| 0| 3|Allen, Mr. Willia...| male|35.0| 0| 0| 373450| 8.05| null| S|
| 7| 0| 1|McCarthy, Mr. Tim...| male|54.0| 0| 0| 17463|51.8625| E46| S|
| 9| 1| 3|Johnson, Mrs. Osc...|female|27.0| 0| 2| 347742|11.1333| null| S|
| 12| 1| 1|Bonnell, Miss. El...|female|58.0| 0| 0| 113783| 26.55| C103| S|
| 14| 0| 3|Andersson, Mr. An...| male|39.0| 1| 5| 347082| 31.275| null| S|
| 16| 1| 2|Hewlett, Mrs. (Ma...|female|55.0| 0| 0| 248706| 16.0| null| S|
| 19| 0| 3|Vander Planke, Mr...|female|31.0| 1| 0| 345763| 18.0| null| S|
| 21| 0| 2|Fynney, Mr. Joseph J| male|35.0| 0| 0| 239865| 26.0| null| S|
| 22| 1| 2|Beesley, Mr. Lawr...| male|34.0| 0| 0| 248698| 13.0| D56| S|
| 24| 1| 1|Sloper, Mr. Willi...| male|28.0| 0| 0| 113788| 35.5| A6| S|
| 26| 1| 3|Asplund, Mrs. Car...|female|38.0| 1| 5| 347077|31.3875| null| S|
| 31| 0| 1|Uruchurtu, Don. M...| male|40.0| 0| 0| PC 17601|27.7208| null| C|
| 34| 0| 2|Wheadon, Mr. Edwa...| male|66.0| 0| 0| C.A. 24579| 10.5| null| S|
| 35| 0| 1|Meyer, Mr. Edgar ...| male|28.0| 1| 0| PC 17604|82.1708| null| C|
| 36| 0| 1|Holverson, Mr. Al...| male|42.0| 1| 0| 113789| 52.0| null| S|
| 41| 0| 3|Ahlin, Mrs. Johan...|female|40.0| 1| 0| 7546| 9.475| null| S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 20 rows
*/
//此处的等号有2个,注意跟"||"、&&结合使用时候等号的差别(有三个等号)
/*
scala> titanicDFCsv.filter("Age == 21").show()
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
| 38| 0| 3|Cann, Mr. Ernest ...| male|21.0| 0| 0| A./5. 2152| 8.05| null| S|
| 52| 0| 3|Nosworthy, Mr. Ri...| male|21.0| 0| 0| A/4. 39886| 7.8| null| S|
| 57| 1| 2| Rugg, Miss. Emily|female|21.0| 0| 0| C.A. 31026| 10.5| null| S|
| 73| 0| 2|Hood, Mr. Ambrose Jr| male|21.0| 0| 0| S.O.C. 14879| 73.5| null| S|
| 103| 0| 1|White, Mr. Richar...| male|21.0| 0| 1| 35281|77.2875| D26| S|
| 107| 1| 3|Salkjelsvik, Miss...|female|21.0| 0| 0| 343120| 7.65| null| S|
| 116| 0| 3|Pekoniemi, Mr. Ed...| male|21.0| 0| 0|STON/O 2. 3101294| 7.925| null| S|
| 121| 0| 2|Hickman, Mr. Stan...| male|21.0| 2| 0| S.O.C. 14879| 73.5| null| S|
| 174| 0| 3|Sivola, Mr. Antti...| male|21.0| 0| 0|STON/O 2. 3101280| 7.925| null| S|
| 392| 1| 3|Jansson, Mr. Carl...| male|21.0| 0| 0| 350034| 7.7958| null| S|
| 403| 0| 3|Jussila, Miss. Ma...|female|21.0| 1| 0| 4137| 9.825| null| S|
| 409| 0| 3|Birkeland, Mr. Ha...| male|21.0| 0| 0| 312992| 7.775| null| S|
| 422| 0| 3| Charters, Mr. David| male|21.0| 0| 0| A/5. 13032| 7.7333| null| Q|
| 437| 0| 3|"Ford, Miss. Dool...|female|21.0| 2| 2| W./C. 6608| 34.375| null| S|
| 492| 0| 3| Windelov, Mr. Einar| male|21.0| 0| 0| SOTON/OQ 3101317| 7.25| null| S|
| 495| 0| 3|Stanley, Mr. Edwa...| male|21.0| 0| 0| A/4 45380| 8.05| null| S|
| 502| 0| 3| Canavan, Miss. Mary|female|21.0| 0| 0| 364846| 7.75| null| Q|
| 624| 0| 3|Hansen, Mr. Henry...| male|21.0| 0| 0| 350029| 7.8542| null| S|
| 625| 0| 3|"Bowen, Mr. David...| male|21.0| 0| 0| 54636| 16.1| null| S|
| 628| 1| 1|Longley, Miss. Gr...|female|21.0| 0| 0| 13502|77.9583| D9| S|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------------+-------+-----+--------+
only showing top 20 rows
*/
//对字符型变量进行过滤
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch|Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+
| 124| 1| 2| Webber, Miss. Susan|female|32.5| 0| 0| 27267| 13.0| E101| S|
| 304| 1| 2| Keane, Miss. Nora A|female|null| 0| 0|226593|12.35| E101| Q|
| 718| 1| 2|"Troutt, Miss. Ed...|female|27.0| 0| 0| 34218| 10.5| E101| S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+-----+-----+--------+
//逻辑连接符 &&(并)、||(或)
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")&&$"Age">21).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
| 124| 1| 2| Webber, Miss. Susan|female|32.5| 0| 0| 27267|13.0| E101| S|
| 718| 1| 2|"Troutt, Miss. Ed...|female|27.0| 0| 0| 34218|10.5| E101| S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
*/
//统计Cabin取值为"E101"且Age取值为27的记录
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")&&$"Age"===27.0).show()
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
| 718| 1| 2|"Troutt, Miss. Ed...|female|27.0| 0| 0| 34218|10.5| E101| S|
+-----------+--------+------+--------------------+------+----+-----+-----+------+----+-----+--------+
*/
//统计Cabin取值为"E101"或Age取值为27的记录,注意数值型变量有三个等号
/*
scala> titanicDFCsv.filter($"Cabin".equalTo("E101")||$"Age"===27.0).show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
| 9| 1| 3|Johnson, Mrs. Osc...|female|27.0| 0| 2| 347742|11.1333| null| S|
| 42| 0| 2|Turpin, Mrs. Will...|female|27.0| 1| 0| 11668| 21.0| null| S|
| 124| 1| 2| Webber, Miss. Susan|female|32.5| 0| 0| 27267| 13.0| E101| S|
| 147| 1| 3|"Andersson, Mr. A...| male|27.0| 0| 0| 350043| 7.7958| null| S|
| 217| 1| 3|Honkanen, Miss. E...|female|27.0| 0| 0|STON/O2. 3101283| 7.925| null| S|
| 222| 0| 2|Bracken, Mr. James H| male|27.0| 0| 0| 220367| 13.0| null| S|
| 304| 1| 2| Keane, Miss. Nora A|female|null| 0| 0| 226593| 12.35| E101| Q|
| 322| 0| 3| Danoff, Mr. Yoto| male|27.0| 0| 0| 349219| 7.8958| null| S|
| 378| 0| 1|Widener, Mr. Harr...| male|27.0| 0| 2| 113503| 211.5| C82| C|
| 552| 0| 2|Sharp, Mr. Perciv...| male|27.0| 0| 0| 244358| 26.0| null| S|
| 608| 1| 1|Daniel, Mr. Rober...| male|27.0| 0| 0| 113804| 30.5| null| S|
| 621| 0| 3| Yasbeck, Mr. Antoni| male|27.0| 1| 0| 2659|14.4542| null| C|
| 682| 1| 1| Hassab, Mr. Hammad| male|27.0| 0| 0| PC 17572|76.7292| D49| C|
| 718| 1| 2|"Troutt, Miss. Ed...|female|27.0| 0| 0| 34218| 10.5| E101| S|
| 725| 1| 1|Chambers, Mr. Nor...| male|27.0| 1| 0| 113806| 53.1| E8| S|
| 805| 1| 3|Hedman, Mr. Oskar...| male|27.0| 0| 0| 347089| 6.975| null| S|
| 822| 1| 3| Lulic, Mr. Nikola| male|27.0| 0| 0| 315098| 8.6625| null| S|
| 824| 1| 3| Moor, Mrs. (Beila)|female|27.0| 0| 1| 392096| 12.475| E121| S|
| 867| 1| 2|Duran y More, Mis...|female|27.0| 1| 0| SC/PARIS 2149|13.8583| null| C|
| 887| 0| 2|Montvila, Rev. Ju...| male|27.0| 0| 0| 211536| 13.0| null| S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
*/
//统计Cabin取值为空或Age取值为空的记录
/*
scala> titanicDFCsv.filter($"Cabin".isNull||$"Age".isNull).count()
res47: Long = 706
*/
转载于:https://my.oschina.net/kyo4321/blog/2994649