// 增加了特征标准化程序(在 KMeans 聚类之前先对特征做标准化)
package class6
import org.apache.spark.mllib.clustering.{KMeansModel, KMeans}
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkContext, SparkConf}
/**
 * Clusters retailers by their yearly stock quantity and sales quantity with MLlib KMeans.
 *
 * Created by root on 16-1-22.
 *
 * Original author's notes (translated):
 *  - Joining the two tables in a single query produced overlapping data, so each table is
 *    aggregated on its own and the two results are joined afterwards to build the feature
 *    matrix.
 *  - A recorded run reported "Within Set Sum of Squared Errors" = 2.6105260195375473E10
 *    (NOTE(review): presumably measured before feature standardization was added — confirm).
 *
 * Usage: the single program argument is the output path for the per-retailer cluster
 * assignments.
 */
object tobacco_kmeans {

  /** Number of clusters used for every KMeans training in this job. */
  private val NumClusters = 3

  /** Number of parallel runs passed to `KMeans.train`. */
  private val Runs = 2

  /** Iteration limit of the final model. */
  private val FinalIterations = 20

  /** Iteration limits that are compared against each other before the final model is trained. */
  private val IterationCandidates: Array[Int] = Array(10, 15, 20, 25, 30, 35, 40)

  /**
   * Builds the two-dimensional feature vector (sales quantity, stock quantity) of one retailer.
   *
   * The values come out of a `Row` as `Any`, so they are converted through their string form,
   * exactly as the job has always done.
   *
   * @param saleQty  aggregated sales quantity column value
   * @param storeQty aggregated stock quantity column value
   * @return dense vector `[saleQty, storeQty]`
   */
  private def toFeatures(saleQty: Any, storeQty: Any): org.apache.spark.mllib.linalg.Vector =
    Vectors.dense(Array[Double](saleQty.toString.toDouble, storeQty.toString.toDouble))

  /**
   * Job entry point.
   *
   * @param args `args(0)` is the path the clustering result is written to with `saveAsTextFile`
   */
  def main(args: Array[String]): Unit = {
    // Validate the arguments up front so a missing output path does not surface only after
    // every model has already been trained.
    if (args.isEmpty) {
      System.err.println("Usage: tobacco_kmeans <output-path>")
      sys.exit(1)
    }
    val outputPath = args(0)

    val sparkConf = new SparkConf().setAppName("tobacco_kmeans") //.setMaster("local[4]")
    val sc = new SparkContext(sparkConf)
    try {
      val hiveContext = new HiveContext(sc)

      // Sales data: total ordered quantity per retailer.
      val saledata = hiveContext.sql("select com_name ,sum(qty_ord) sale_qty from hhsale_data where puh_time is " +
        "not null group by com_name")

      // Stock data: total quantity per retailer.
      val storedata = hiveContext.sql("select com_name ,sum(qty_ord) store_qty from hhstore_data where item_code is not " +
        "null and qty_ord >0 group by com_name")

      // Resulting columns: com_name, sale_qty, store_qty.
      val data = saledata.join(storedata, "com_name")

      val parsedData = data.map {
        case Row(_, saleQty, storeQty) => toFeatures(saleQty, storeQty)
      }.cache()

      // Feature standardization: zero mean and unit standard deviation per column.
      val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedData)

      // Cached because it is the input of every training and every cost computation below;
      // without the cache each of them would recompute the scaling.
      val scaledVectors = parsedData.map(v => scaler.transform(v)).cache()

      // Compare the clustering cost for different iteration limits (random initialization).
      IterationCandidates.foreach { maxIterations =>
        val candidate: KMeansModel =
          KMeans.train(scaledVectors, NumClusters, maxIterations, Runs, "random")
        val ssd = candidate.computeCost(scaledVectors)
        println(s"sum of squared distances of points to their nearest center when itr=$maxIterations -> $ssd")
      }

      // Final model; uses the default initialization mode of KMeans.train.
      val model = KMeans.train(scaledVectors, NumClusters, FinalIterations, Runs)

      // Print the cluster centers of the model.
      println("---------------------------------------------------------------" +
        "Cluster centers:" +
        "---------------------------------------------------------------------")
      model.clusterCenters.foreach(center => println(" " + center.toString))

      // Evaluate the model with the sum of squared errors, i.e. the loss on the training set.
      val cost = model.computeCost(scaledVectors)
      println("--------------------------------------------------------------------" +
        "Within Set Sum of Squared Errors=-----------------------------------------" + cost)

      // Classify the input data with the model and write the result out.
      // Author's note: because no partition count is configured the output consists of 200
      // small files; they can be merged and downloaded with `bin/hdfs dfs -getmerge`.
      data.map {
        case Row(comName, saleQty, storeQty) =>
          // The model was trained on standardized features, so standardize before predicting.
          val prediction = model.predict(scaler.transform(toFeatures(saleQty, storeQty)))
          // NOTE(review): saveAsTextFile already terminates every record with a newline, so the
          // trailing "\n" yields an empty line after each record. Kept for output compatibility.
          s"$comName $saleQty $storeQty $prediction\n"
      }.saveAsTextFile(outputPath)

      System.out.println("-----------------------------")
    } finally {
      // Release the cluster resources even when a query, training or the write fails.
      sc.stop()
    }
  }
}
改变迭代次数(itr)、随机选取初始点(initializationMode = "random")、runs = 2 时的评估结果:
sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775106
sum of squared distances of points to their nearest center when itr=15 -> 68.45241255775106
sum of squared distances of points to their nearest center when itr=20 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=25 -> 68.38946484451297
sum of squared distances of points to their nearest center when itr=30 -> 69.15875531327036
sum of squared distances of points to their nearest center when itr=35 -> 68.5020394304827
sum of squared distances of points to their nearest center when itr=40 -> 68.64494935350622
---------------------------------------------------------------Cluster centers:---------------------------------------------------------------------
[-0.3977231394410828,-0.08638511951423264]
[-0.3525021012551558,1.5702237448594607]
[1.4603723091512353,-0.8149743960785426]
--------------------------------------------------------------------Within Set Sum of Squared Errors=-----------------------------------------68.64494935350622
-----------------------------
~
sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=15 -> 68.52141419538006
sum of squared distances of points to their nearest center when itr=20 -> 68.38946484451294
sum of squared distances of points to their nearest center when itr=25 -> 69.15875531327033
sum of squared distances of points to their nearest center when itr=30 -> 68.64494935350619
sum of squared distances of points to their nearest center when itr=35 -> 68.64494935350619
sum of squared distances of points to their nearest center when itr=40 -> 68.45241255775107
随机选取初始点,改变 runs(运行次数)时的评估结果:
sum of squared distances of points to their nearest center when run=1 -> 68.64494935350622
sum of squared distances of points to their nearest center when run=2 -> 78.26550176498397
sum of squared distances of points to their nearest center when run=3 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=4 -> 68.52141419538006
sum of squared distances of points to their nearest center when run=5 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=6 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=7 -> 68.38946484451297
sum of squared distances of points to their nearest center when run=8 -> 68.28913722951712
sum of squared distances of points to their nearest center when run=9 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=10 -> 68.26536074109478
-----------------------------
使用 k-means++ 选取初始点,改变 runs(运行次数)时的评估结果:
sum of squared distances of points to their nearest center when run=1 -> 68.28913722951711
sum of squared distances of points to their nearest center when run=2 -> 72.48322271456834
sum of squared distances of points to their nearest center when run=3 -> 68.45241255775106
sum of squared distances of points to their nearest center when run=4 -> 68.6449493535062
sum of squared distances of points to their nearest center when run=5 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=6 -> 68.45241255775105
sum of squared distances of points to their nearest center when run=7 -> 68.45241255775106
sum of squared distances of points to their nearest center when run=8 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=9 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=10 -> 68.28913722951711
-----------------------------
~
迭代20次、runs = 2 时的评估结果:
sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=15 -> 72.28822380601336
sum of squared distances of points to their nearest center when itr=20 -> 72.48322271456834
sum of squared distances of points to their nearest center when itr=25 -> 68.64494935350622
sum of squared distances of points to their nearest center when itr=30 -> 68.38946484451297
sum of squared distances of points to their nearest center when itr=35 -> 68.28913722951712
sum of squared distances of points to their nearest center when itr=40 -> 70.36885180809672
-----------------------------
迭代20次、runs = 2、使用 k-means++ 初始化时的
聚类结果(各列依次为:零售户名称、销售量、库存量、所属簇编号):
[094]兴关店 18706 57706.000000 0
[012]贵钢店 63320 40860.000000 0
[056]观水店 28934 81498.000000 1
[043]云阳店 49752 51101.000000 0
[027]湘雅店 116073 32931.000000 2
[077]凤凰翠堤 50637 42147.000000 0
[054]O六一店 55564 62501.000000 1
[073]和平店 37079 60840.000000 1
[075]四方河店 53378 54411.000000 0
[065]比兰德店 76568 48998.000000 2
[051]新威店 44724 51807.000000 0
[085]摩卡店 21160 56871.000000 0
[024]金狮店A 43576 49296.000000 0
[017]欣歆店 32897 59049.000000 0
[046]贝地店 44829 48188.000000 0
[063]金果园店 55705 31281.000000 0
[019]黔灵店 38509 81074.000000 1
[060]筑兴店 73269 30190.000000 2
[036]华阳店 47707 51533.000000 0
[037]小石城 52363 50968.000000 0
[034]交校店 21708 37380.000000 0
[079]万江店 44458 48095.000000 0
[096]鸿通城 38592 50089.000000 0
[072]吉奥店 45314 51248.000000 0
[067]二中店 5713 2870.000000 0
[093]新光店 14395 43151.000000 0
[040]玉田店 73169 31059.000000 2
[076]三桥北店 66542 49079.000000 0
[070]军区店 31491 45743.000000 0
[084]中天店 48581 50627.000000 0
[042]马王店 49864 51891.000000 0
[001]白云一店 94509 60483.000000 2
[055]城基店 46682 55537.000000 0
[068]枣山店 50042 58374.000000 0
[087]警校店 18675 37316.000000 0
[078]世纪新城 33088 38337.000000 0
[069]松竹苑店 44036 46503.000000 0
[050]世纪园店 39734 38059.000000 0
[008]省委店 41826 91960.000000 1
[026]501店 42622 59411.000000 0
[010]教育学院店 82740 33065.000000 2
[035]曦阳店 55683 41977.000000 0
[038]振华店 41467 71864.000000 1
[071]枫丹店 138029 27649.000000 2
[015]清水江店 73585 25374.000000 2
[089]蟠桃宫店 23821 54760.000000 0
[013]瑞和店 33334 43215.000000 0
[022]083店A 43406 56510.000000 0
[049]贵龙店 84094 34512.000000 2
[066]六广门店 33312 63732.000000 1
[029]大理石店 96701 58521.000000 2
[095]叠翠谷店 33754 55371.000000 0
[014]虹桥店 82285 36892.000000 2
[006]月亮岩 65030 52661.000000 0
[098]浦江店 15519 75004.000000 1
[011]凤凰店 41914 50276.000000 0
[086]东新店 33324 56329.000000 0
[081]福楼旺邸店 41874 50275.000000 0
[062]头桥店 53363 53378.000000 0
[041]万东店 100295 32096.000000 2
[007]家乐店 60275 51752.000000 0
[028]威清店 21674 37084.000000 0
[074]十二中店 50059 38261.000000 0
[053]嘉怡店 45155 33315.000000 0
[032]宅吉店 60291 53308.000000 0
[030]东门店 73298 40233.000000 2
[061]太慈店 90692 30359.000000 2
[059]中北店 49589 67715.000000 1
[021]183店 62289 26584.000000 2
[033]新发店 182588 50403.000000 2
[020]贵医店 50190 64561.000000 1
[047]宝山南店 48689 46983.000000 0
[090]保利温泉店 29566 53804.000000 0
[002]白云二店 72947 61744.000000 1
[092]龙宇店 29702 52608.000000 0
[025]宏福店 16933 48769.000000 0