- Spark session available as 'spark'.
- Welcome to Spark version 2.0.1
-
- import org.apache.spark.sql.SparkSession
- import org.apache.spark.sql.Dataset
- import org.apache.spark.sql.Row
- import org.apache.spark.sql.DataFrame
- import org.apache.spark.sql.Column
- import org.apache.spark.sql.DataFrameReader
- import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
- import org.apache.spark.sql.Encoder
- import org.apache.spark.ml.linalg.Vectors
- import org.apache.spark.ml.feature.VectorAssembler
- import org.apache.spark.ml.regression.LinearRegression
-
-
- scala> val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
- 16/11/05 15:07:06 WARN SparkSession$Builder: Use an existing SparkSession, some configuration may not take effect.
- spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3300811b
-
- scala>
-
- scala> // For implicit conversions like converting RDDs to DataFrames
-
- scala> import spark.implicits._
- import spark.implicits._
-
- scala>
-
- scala> // Load training data
-
- scala> val data: DataFrame = spark.read.format("csv").option("header", true).load("hdfs://ns1/datafile/wfp.csv")
- data: org.apache.spark.sql.DataFrame = [windSpeed: string, power: string]
-
- scala>
-
- scala> data.cache()
- 16/11/05 15:07:12 WARN CacheManager: Asked to cache already cached data.
- res21: data.type = [windSpeed: string, power: string]
-
- scala>
-
- scala> data.limit(10).show
- 16/11/05 15:07:13 WARN Executor: 1 block locks were not released by TID = 352:
- [rdd_9_0]
- +---------+-----+
- |windSpeed|power|
- +---------+-----+
- | 3 | 20|
- | 3.5 | 30|
- | 4 | 50|
- | 4.5 | 100|
- | 5 | 200|
- | 5.5 | 300|
- | 6 | 400|
- | 6.5 | 500|
- | 7 | 600|
- | 7.5 | 700|
- +---------+-----+
-
-
- scala>
-
- scala> // Cast both columns to Double; windSpeed is cubed (windSpeed³) but aliased back to "windSpeed"
-
- scala> val data1= data.select( ( data("windSpeed").cast("Double")*data("windSpeed").cast("Double")*data("windSpeed").cast("Double") ).as("windSpeed"),data("power").cast("Double") )
- data1: org.apache.spark.sql.DataFrame = [windSpeed: double, power: double]
-
- scala>
-
- scala> data1.limit(10).show
- 16/11/05 15:07:16 WARN Executor: 1 block locks were not released by TID = 353:
- [rdd_9_0]
- +---------+-----+
- |windSpeed|power|
- +---------+-----+
- | 27.0 | 20.0|
- | 42.875 | 30.0|
- | 64.0 | 50.0|
- | 91.125 |100.0|
- | 125.0 |200.0|
- | 166.375 |300.0|
- | 216.0 |400.0|
- | 274.625 |500.0|
- | 343.0 |600.0|
- | 421.875 |700.0|
- +---------+-----+
-
-
- scala>
-
- scala> val data2=data1.filter("power>20 and windSpeed<3500").orderBy("windSpeed")
- data2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [windSpeed: double, power: double]
-
- scala> data2.limit(10).show
- +---------+-----+
- |windSpeed|power|
- +---------+-----+
- | 42.875 | 30.0|
- | 64.0 | 50.0|
- | 91.125 |100.0|
- | 125.0 |200.0|
- | 166.375 |300.0|
- | 216.0 |400.0|
- | 274.625 |500.0|
- | 343.0 |600.0|
- | 421.875 |700.0|
- | 512.0 |800.0|
- +---------+-----+
-
-
- scala>
- |       // Convert to the label/features format expected by Spark ML
-
- scala> val assembler = new VectorAssembler().setInputCols(Array("windSpeed")).setOutputCol("features")
- assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_6f4ca7e2549a
-
- scala>
-
- scala> val output: DataFrame = assembler.transform(data2)
- output: org.apache.spark.sql.DataFrame = [windSpeed: double, power: double ... 1 more field]
-
- scala>
-
- scala> output.printSchema()
- root
- |-- windSpeed: double (nullable = true)
- |-- power: double (nullable = true)
- |-- features: vector (nullable = true)
-
-
- scala>
-
- scala> output.limit(10).show
- +---------+-----+---------+
- |windSpeed|power| features|
- +---------+-----+---------+
- | 42.875 | 30.0| [42.875]|
- | 64.0 | 50.0| [64.0] |
- | 91.125 |100.0| [91.125]|
- | 125.0 |200.0| [125.0] |
- | 166.375 |300.0| [166.375]|
- | 216.0 |400.0| [216.0] |
- | 274.625 |500.0| [274.625]|
- | 343.0 |600.0| [343.0] |
- | 421.875 |700.0| [421.875]|
- | 512.0 |800.0| [512.0] |
- +---------+-----+---------+
-
-
- scala>
-
- scala> val training = output
- training: org.apache.spark.sql.DataFrame = [windSpeed: double, power: double ... 1 more field]
-
- scala>
-
- scala> training.cache()
- res28: training.type = [windSpeed: double, power: double ... 1 more field]
-
- scala>
-
- scala> // Configure the linear regression parameters
-
- scala> val lr = new LinearRegression().setLabelCol("power").setFeaturesCol("features").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8).setFitIntercept(true)
- lr: org.apache.spark.ml.regression.LinearRegression = linReg_58dde0c17920
-
- scala>
-
- scala> // Fit the model
-
- scala> val lrModel = lr.fit(training)
- lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_58dde0c17920
-
- scala>
-
- scala> // Print the coefficients and intercept for linear regression
-
- scala> println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
- Coefficients: [0.5113433522323718] Intercept: 389.0639900098431
-
- scala>
-
- scala> // Summarize the model over the training set and print out some metrics
-
- scala> val trainingSummary = lrModel.summary
- trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@3ee2b229
-
- scala> println(s"numIterations: ${trainingSummary.totalIterations}")
- numIterations: 4
-
- scala> println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
- objectiveHistory: List(0.4791666666666666, 0.3927565804317654, 0.1589356484493222, 0.08099533778850757)
-
- scala> trainingSummary.residuals.show()
- +-------------------+
- | residuals|
- +-------------------+
- |-380.98783623680606|
- | -371.7899645527149|
- |-335.66015298201796|
- |-252.98190903888957|
- |-174.13874023750395|
- | -99.51415409203543|
- |-29.491658116658186|
- | 35.5452401744534 |
- | 95.21303326712507 |
- | 149.12821364718252|
- | 196.90727380045155|
- | 238.16670621275784|
- | 272.5230033699271 |
- | 299.592657757785 |
- | 318.99216186215745|
- | 280.33800816887015|
- | 233.24668916374844|
- | 177.33469733261836|
- | 112.2185251613057 |
- | 37.51466513563605 |
- +-------------------+
- only showing top 20 rows
-
-
- scala> println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
- RMSE: 231.88264620805631
-
- scala> println(s"r2: ${trainingSummary.r2}")
- r2: 0.8318466871093443
From the "ITPUB Blog"; link: http://blog.itpub.net/29070860/viewspace-2127853/ — if reprinting, please cite the source.
Reprinted from: http://blog.itpub.net/29070860/viewspace-2127853/