Spark 2: One-Hot Encoding, Standardization, PCA, and Clustering


  // affairs: frequency of extramarital affairs over the past year
  // gender: gender
  // age: age
  // yearsmarried: number of years married
  // children: whether there are children
  // religiousness: degree of religiousness (5-point scale; 1 = opposed, 5 = very religious)
  // education: education level
  // occupation: occupation (Gordon's 7-category classification, reverse-numbered)
  // rating: self-rating of the marriage (5-point scale; 1 = very unhappy, 5 = very happy)

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.Dataset
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.Column
  import org.apache.spark.sql.DataFrameReader
  import org.apache.spark.rdd.RDD
  import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
  import org.apache.spark.sql.Encoder
  import org.apache.spark.ml.linalg.Vectors
  import org.apache.spark.ml.feature.StringIndexer
  import org.apache.spark.ml.feature.OneHotEncoder
  import org.apache.spark.ml.feature.VectorAssembler
  import org.apache.spark.ml.feature.StandardScaler
  import org.apache.spark.ml.feature.PCA
  import org.apache.spark.ml.clustering.KMeans

scala> val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

scala>

scala> // For implicit conversions like converting RDDs to DataFrames
scala> import spark.implicits._


scala> val data: DataFrame = spark.read.format("csv").option("header", true).load("hdfs://ns1/datafile/wangxiao/Affairs.csv")
data: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala>

scala> data.cache
res0: data.type = [affairs: string, gender: string ... 7 more fields]

scala>

scala> data.limit(10).show()
(output omitted)
scala>

scala> // Cast the column types, keeping the Double fields and the String fields apart

scala> val data1 = data.select(
     | data("affairs").cast("Double"),
     | data("age").cast("Double"),
     | data("yearsmarried").cast("Double"),
     | data("religiousness").cast("Double"),
     | data("education").cast("Double"),
     | data("occupation").cast("Double"),
     | data("rating").cast("Double"),
     | data("gender").cast("String"),
     | data("children").cast("String"))
data1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

scala>

scala> data1.printSchema()
root
 |-- affairs: double (nullable = true)
 |-- age: double (nullable = true)
 |-- yearsmarried: double (nullable = true)
 |-- religiousness: double (nullable = true)
 |-- education: double (nullable = true)
 |-- occupation: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- children: string (nullable = true)


scala> data1.limit(10).show
(output omitted)
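
Every column came back as a string because the CSV was read without schema inference, which is why the explicit casts above were needed. As an alternative sketch, enabling inferSchema at read time lets Spark detect the numeric columns itself (same path as above):

// Sketch: infer numeric column types while reading, instead of casting afterwards.
val dataTyped = spark.read.format("csv")
  .option("header", true)
  .option("inferSchema", true)
  .load("hdfs://ns1/datafile/wangxiao/Affairs.csv")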




scala>

scala> val dataDF = data1
dataDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

scala>

scala> dataDF.cache()
res4: dataDF.type = [affairs: double, age: double ... 7 more fields]

scala>

scala> //###################################

scala> val indexer = new StringIndexer().setInputCol("gender").setOutputCol("genderIndex").fit(dataDF)
indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_19a888aff882

scala> val indexed = indexer.transform(dataDF)
indexed: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 8 more fields]

scala> // One-hot encoding; note that setDropLast is set to false, so every category keeps its own slot

scala> val encoder = new OneHotEncoder().setInputCol("genderIndex").setOutputCol("genderVec").setDropLast(false)
encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_f0f47e0b5b37

scala> val encoded = encoder.transform(indexed)
encoded: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 9 more fields]

scala> encoded.show()
(output omitted)
scala>

scala> val indexer1 = new StringIndexer().setInputCol("children").setOutputCol("childrenIndex").fit(encoded)
indexer1: org.apache.spark.ml.feature.StringIndexerModel = strIdx_7e4d8c69b823

scala> val indexed1 = indexer1.transform(encoded)
indexed1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 10 more fields]

scala> val encoder1 = new OneHotEncoder().setInputCol("childrenIndex").setOutputCol("childrenVec").setDropLast(false)
encoder1: org.apache.spark.ml.feature.OneHotEncoder = oneHot_9a8906781325

scala> val encoded1 = encoder1.transform(indexed1)
encoded1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

scala> encoded1.show()
(output omitted)
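
The same index-then-encode pattern was applied to gender and children one stage at a time. As a minimal sketch (same column names as above; OneHotEncoder here is the Spark 2.x transformer, which Spark 3.x replaces with a multi-column estimator), the four stages can also be chained in a single Pipeline:

import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

// Build index + one-hot stages for both string columns, then run them as one Pipeline.
val stages: Array[PipelineStage] = Array("gender", "children").flatMap { c =>
  Seq(
    new StringIndexer().setInputCol(c).setOutputCol(c + "Index"),
    new OneHotEncoder().setInputCol(c + "Index").setOutputCol(c + "Vec").setDropLast(false)
  )
}
val encodedAll = new Pipeline().setStages(stages).fit(dataDF).transform(dataDF)
// encodedAll carries the same genderVec and childrenVec columns as encoded1 above.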



scala>

scala> val encodeDF: DataFrame = encoded1
encodeDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

scala> encodeDF.show()
(output omitted)
scala> encodeDF.printSchema()
root
 |-- affairs: double (nullable = true)
 |-- age: double (nullable = true)
 |-- yearsmarried: double (nullable = true)
 |-- religiousness: double (nullable = true)
 |-- education: double (nullable = true)
 |-- occupation: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- children: string (nullable = true)
 |-- genderIndex: double (nullable = true)
 |-- genderVec: vector (nullable = true)
 |-- childrenIndex: double (nullable = true)
 |-- childrenVec: vector (nullable = true)


scala>

scala> //#################################

scala> val assembler = new VectorAssembler().setInputCols(Array("affairs", "age", "yearsmarried", "religiousness", "education", "occupation", "rating", "genderVec", "childrenVec")).setOutputCol("features")
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_8ccd528981cd

scala>

scala> val vecDF: DataFrame = assembler.transform(encodeDF)
vecDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 12 more fields]

scala> vecDF.select("features").show
16/11/05 15:56:14 WARN Executor: 1 block locks were not released by TID = 11:
[rdd_17_0]
+--------------------+
|            features|
+--------------------+
|[0.0,37.0,10.0,3....|
|[0.0,27.0,4.0,4.0...|
|[0.0,32.0,15.0,1....|
|[0.0,57.0,15.0,5....|
|[0.0,22.0,0.75,2....|
|[0.0,32.0,1.5,2.0...|
|[0.0,22.0,0.75,2....|
|[0.0,57.0,15.0,2....|
|[0.0,32.0,15.0,4....|
|[0.0,22.0,1.5,4.0...|
|[0.0,37.0,15.0,2....|
|[0.0,27.0,4.0,4.0...|
|[0.0,47.0,15.0,5....|
|[0.0,22.0,1.5,2.0...|
|[0.0,27.0,4.0,4.0...|
|[0.0,37.0,15.0,1....|
|[0.0,37.0,15.0,2....|
|[0.0,22.0,0.75,3....|
|[0.0,22.0,1.5,2.0...|
|[0.0,27.0,10.0,2....|
+--------------------+
only showing top 20 rows
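
show truncates each cell to 20 characters, so the assembled 11-element vectors (7 numeric columns plus two 2-element one-hot vectors) are cut off above. A quick sketch for inspecting the full vectors, passing truncate = false:

// Sketch: print a few assembled rows without truncation.
vecDF.select("features").show(5, false)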


scala>

scala> // Standardization: center each feature to zero mean and scale to unit standard deviation

scala> val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(true)
scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_2e35fbc29084

scala>

scala> // Compute summary statistics by fitting the StandardScaler.

scala> val scalerModel = scaler.fit(vecDF)
scalerModel: org.apache.spark.ml.feature.StandardScalerModel = stdScal_2e35fbc29084

scala>

scala> // Normalize each feature to have unit standard deviation.

scala> val scaledData: DataFrame = scalerModel.transform(vecDF)
scaledData: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 13 more fields]

scala> // scaledData: DataFrame = [features: vector, scaledFeatures: vector]

scala>

scala> scaledData.select("features", "scaledFeatures").show
16/11/05 15:56:20 WARN Executor: 1 block locks were not released by TID = 13:
[rdd_17_0]
+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[0.0,37.0,10.0,3....|[-0.4413500298573...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|
|[0.0,32.0,15.0,1....|[-0.4413500298573...|
|[0.0,57.0,15.0,5....|[-0.4413500298573...|
|[0.0,22.0,0.75,2....|[-0.4413500298573...|
|[0.0,32.0,1.5,2.0...|[-0.4413500298573...|
|[0.0,22.0,0.75,2....|[-0.4413500298573...|
|[0.0,57.0,15.0,2....|[-0.4413500298573...|
|[0.0,32.0,15.0,4....|[-0.4413500298573...|
|[0.0,22.0,1.5,4.0...|[-0.4413500298573...|
|[0.0,37.0,15.0,2....|[-0.4413500298573...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|
|[0.0,47.0,15.0,5....|[-0.4413500298573...|
|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|
|[0.0,37.0,15.0,1....|[-0.4413500298573...|
|[0.0,37.0,15.0,2....|[-0.4413500298573...|
|[0.0,22.0,0.75,3....|[-0.4413500298573...|
|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|
|[0.0,27.0,10.0,2....|[-0.4413500298573...|
+--------------------+--------------------+
only showing top 20 rows
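
Because both setWithMean(true) and setWithStd(true) are enabled, every column of scaledFeatures should now have mean ≈ 0 and standard deviation ≈ 1 (note that mean-centering produces dense vectors). A sanity-check sketch using the RDD-based statistics API that ships with Spark 2.x:

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.stat.Statistics

// Sketch: confirm that scaling produced mean ~0 and variance ~1 per feature.
val scaledRows = scaledData.select("scaledFeatures").rdd
  .map(row => OldVectors.fromML(row.getAs[Vector](0)))
val summary = Statistics.colStats(scaledRows)
println(summary.mean)      // each entry close to 0
println(summary.variance)  // each entry close to 1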


scala>

scala> //##########################

scala> // Principal component analysis

scala> val pca = new PCA().setInputCol("scaledFeatures").setOutputCol("pcaFeatures").setK(3).fit(scaledData)
16/11/05 15:56:21 WARN Executor: 1 block locks were not released by TID = 14:
[rdd_17_0]
16/11/05 15:56:22 WARN Executor: 1 block locks were not released by TID = 15:
[rdd_17_0]
16/11/05 15:56:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
16/11/05 15:56:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
16/11/05 15:56:25 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
16/11/05 15:56:25 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
pca: org.apache.spark.ml.feature.PCAModel = pca_8569d580d6e4

scala> pca.explainedVariance.values // proportion of variance explained by each component
res11: Array[Double] = Array(0.28779526464781313, 0.23798543640278289, 0.11742828783633019)
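
The three components explain about 29%, 24%, and 12% of the variance respectively; a one-line sketch for the cumulative share, using the array above:

// Sketch: cumulative explained variance of the k = 3 components.
val cumulative = pca.explainedVariance.values.scanLeft(0.0)(_ + _).tail
// ~Array(0.288, 0.526, 0.643): the three components keep about 64% of the variance.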

scala> pca.pc // loadings (correlations between the observed variables and the principal components)
res12: org.apache.spark.ml.linalg.DenseMatrix =
-0.12034310848156521   0.05153952289637974    0.6678769450480689
-0.42860623714516627   0.05417889891307473   -0.05592377098140197
-0.44404074412877986   0.1926596811059294    -0.017025575192258197
-0.12233707317255231   0.08053139375662526   -0.5093149296300096
-0.14664751606128462  -0.3872166556211308    -0.03406819489501708
-0.145543746024348    -0.43054860653839705    0.07841454709046872
 0.17703994181974803  -0.12792784984216296   -0.5173229755329072
 0.2459668445061567    0.4915809641798787     0.010477548320795945
-0.2459668445061567   -0.4915809641798787    -0.010477548320795945
-0.44420980045271047   0.240652448514566     -0.089356723885704
 0.4442098004527103   -0.24065244851456588    0.08935672388570405

scala> pca.extractParamMap()
res13: org.apache.spark.ml.param.ParamMap =
{
    pca_8569d580d6e4-inputCol: scaledFeatures,
    pca_8569d580d6e4-k: 3,
    pca_8569d580d6e4-outputCol: pcaFeatures
}

scala> pca.params
res14: Array[org.apache.spark.ml.param.Param[_]] = Array(pca_8569d580d6e4__inputCol, pca_8569d580d6e4__k, pca_8569d580d6e4__outputCol)

scala>

scala> val pcaDF: DataFrame = pca.transform(scaledData)
pcaDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 14 more fields]

scala> // pcaDF: DataFrame = [features: vector, scaledFeatures: vector, pcaFeatures: vector]

scala> pcaDF.cache()
res15: pcaDF.type = [affairs: double, age: double ... 14 more fields]

scala>

scala> pcaDF.printSchema()
root
 |-- affairs: double (nullable = true)
 |-- age: double (nullable = true)
 |-- yearsmarried: double (nullable = true)
 |-- religiousness: double (nullable = true)
 |-- education: double (nullable = true)
 |-- occupation: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- children: string (nullable = true)
 |-- genderIndex: double (nullable = true)
 |-- genderVec: vector (nullable = true)
 |-- childrenIndex: double (nullable = true)
 |-- childrenVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)
 |-- pcaFeatures: vector (nullable = true)


scala> pcaDF.select("features", "scaledFeatures", "pcaFeatures").show
16/11/05 15:56:36 WARN Executor: 1 block locks were not released by TID = 18:
[rdd_64_0]
+--------------------+--------------------+--------------------+
|            features|      scaledFeatures|         pcaFeatures|
+--------------------+--------------------+--------------------+
|[0.0,37.0,10.0,3....|[-0.4413500298573...|[0.27828160409293...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.42147114101165...|
|[0.0,32.0,15.0,1....|[-0.4413500298573...|[0.18301418047489...|
|[0.0,57.0,15.0,5....|[-0.4413500298573...|[-2.9795960667914...|
|[0.0,22.0,0.75,2....|[-0.4413500298573...|[1.79299133565688...|
|[0.0,32.0,1.5,2.0...|[-0.4413500298573...|[2.65694237441759...|
|[0.0,22.0,0.75,2....|[-0.4413500298573...|[3.48234503794570...|
|[0.0,57.0,15.0,2....|[-0.4413500298573...|[-2.4215838062079...|
|[0.0,32.0,15.0,4....|[-0.4413500298573...|[-0.6964555195741...|
|[0.0,22.0,1.5,4.0...|[-0.4413500298573...|[2.18771069800414...|
|[0.0,37.0,15.0,2....|[-0.4413500298573...|[-2.4259075891377...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[-0.7743038356008...|
|[0.0,47.0,15.0,5....|[-0.4413500298573...|[-2.6176149267534...|
|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[2.95788535193022...|
|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.50146472861263...|
|[0.0,37.0,15.0,1....|[-0.4413500298573...|[-0.5123817022008...|
|[0.0,37.0,15.0,2....|[-0.4413500298573...|[-0.9191740114044...|
|[0.0,22.0,0.75,3....|[-0.4413500298573...|[2.97391491782863...|
|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[3.17940505267806...|
|[0.0,27.0,10.0,2....|[-0.4413500298573...|[0.74585406839527...|
+--------------------+--------------------+--------------------+
only showing top 20 rows


scala>

scala> //#####################################

scala>

scala> // Note the maximum number of iterations and the silhouette coefficient

scala> val KSSE = (2 to 10 by 1).par.toList.map { k =>
     | // Clustering
     | // Trains a k-means model.
     | val kmeans = new KMeans().setK(k).setSeed(1L).setFeaturesCol("pcaFeatures")
     | val model = kmeans.fit(pcaDF)
     |
     | // Evaluate clustering by computing Within Set Sum of Squared Errors.
     | val WSSSE = model.computeCost(pcaDF)
     |
     | (k, WSSSE)
     | }
KSSE: List[(Int, Double)] = List((2,2876.20580405469), (3,1680.6647048004902), (4,1395.7184052948346), (5,1239.9362814229812), (6,999.2793106095127), (7,849.0071338527408), (8,737.8560221633246), (9,771.8211752483357), (10,655.7836351785677))

scala>

scala> KSSE.foreach(println)
(2,2876.20580405469)
(3,1680.6647048004902)
(4,1395.7184052948346)
(5,1239.9362814229812)
(6,999.2793106095127)
(7,849.0071338527408)
(8,737.8560221633246)
(9,771.8211752483357)
(10,655.7836351785677)
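
WSSSE shrinks almost monotonically as k grows (the bump at k = 9 is a local optimum of k-means), so the elbow of this curve is the usual guide for choosing k. The comment before the loop also mentions the silhouette coefficient; as a sketch, ClusteringEvaluator (available from Spark 2.3 onward, so not in the Spark 2.0 session shown here) scores each candidate k directly:

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Sketch (Spark 2.3+): silhouette for each candidate k; higher is better.
val evaluator = new ClusteringEvaluator().setFeaturesCol("pcaFeatures")
val silhouettes = (2 to 10).map { k =>
  val model = new KMeans().setK(k).setSeed(1L).setFeaturesCol("pcaFeatures").fit(pcaDF)
  (k, evaluator.evaluate(model.transform(pcaDF)))
}
silhouettes.foreach(println) // pick the k with the highest silhouette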

From the ITPUB blog: http://blog.itpub.net/29070860/viewspace-2127855/. Please credit the source when reposting.
