零售户聚类改善

增加了特征标准化程序

package class6

import org.apache.spark.mllib.clustering.{KMeansModel, KMeans}
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by root on 16-1-22.
 *
 * Clusters retailers with k-means using two features per retailer: the summed
 * sales quantity and the summed stock quantity, both standardized before
 * training.
 *
 * Original author's note: joining the two tables in a single query produced
 * overlapping rows, so each table is queried on its own and the two results
 * are joined afterwards to build the feature matrix.
 *
 * Recorded result from an earlier run (presumably before standardization was
 * added -- TODO confirm):
 *   Within Set Sum of Squared Errors = 2.6105260195375473E10
 */
object tobacco_kmeans {

  /** Builds the two-dimensional (sales, stock) feature vector from raw column values. */
  private def toFeatures(saleQty: Any, storeQty: Any) =
    Vectors.dense(saleQty.toString.toDouble, storeQty.toString.toDouble)

  /**
   * Entry point.
   *
   * @param args args(0) is the output path handed to `saveAsTextFile`
   * @throws IllegalArgumentException if no output path is supplied
   */
  def main(args: Array[String]): Unit = {
    // Fail fast: without this check a missing argument is only noticed after
    // every model has already been trained.
    require(args.nonEmpty, "usage: tobacco_kmeans <output-path>")
    val outputPath = args(0)

    val sparkConf = new SparkConf().setAppName("tobacco_kmeans") // .setMaster("local[4]")
    val sc = new SparkContext(sparkConf)
    try {
      val hiveContext = new HiveContext(sc)

      // Sales data: total ordered quantity per retailer.
      val saledata = hiveContext.sql(
        "select com_name ,sum(qty_ord) sale_qty from hhsale_data where puh_time is " +
          "not null group by com_name")

      // Stock data: total positive quantity per retailer.
      val storedata = hiveContext.sql(
        "select com_name ,sum(qty_ord) store_qty from hhstore_data where item_code is not " +
          "null and qty_ord >0 group by com_name")

      // Joined rows are (com_name, sale_qty, store_qty).
      val data = saledata.join(storedata, "com_name")

      val parsedData = data.map {
        case Row(_, saleQty, storeQty) => toFeatures(saleQty, storeQty)
      }.cache() // .saveAsTextFile("/class6/data")

      // Feature standardization: centre on the mean and scale to unit std.
      val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedData)
      // Cached because every training run and every cost evaluation below
      // re-reads this RDD.
      val scaledVectors = parsedData.map(v => scaler.transform(v)).cache()

      // Compare the clustering cost for different maximum iteration counts.
      // val iterationCounts = Array(1,2,3,4,5,6,7,8,9,10)
      val iterationCounts = Array(10, 15, 20, 25, 30, 35, 40)
      iterationCounts.foreach { maxIterations =>
        val sweepModel: KMeansModel = KMeans.train(scaledVectors, 3, maxIterations, 2, "random")
        val ssd = sweepModel.computeCost(scaledVectors)
        println(s"sum of squared distances of points to their nearest center when itr=$maxIterations -> $ssd")
      }

      val numClusters = 3
      val numIterations = 20
      val model = KMeans.train(scaledVectors, numClusters, numIterations, runs = 2)

      // Print the cluster centres of the final model.
      println("---------------------------------------------------------------" +
        "Cluster centers:" +
        "---------------------------------------------------------------------")
      model.clusterCenters.foreach(center => println(" " + center.toString))

      // Evaluate the model by its sum of squared errors on the training set.
      val cost = model.computeCost(scaledVectors)
      println("--------------------------------------------------------------------" +
        "Within Set Sum of Squared Errors=-----------------------------------------" + cost)

      // Classify every input row with the model and write the result out.
      // The partition count is not set, so the output is about 200 small files;
      // merge and download them with `bin/hdfs dfs -getmerge`.
      data.map {
        case Row(comName, saleQty, storeQty) =>
          // Apply the same standardization that was used for training.
          val scaledLine = scaler.transform(toFeatures(saleQty, storeQty))
          val prediction = model.predict(scaledLine)
          // NOTE(review): the trailing "\n" is kept from the original; it appears
          // to be why the saved output has a blank line after every record.
          s"$comName $saleQty $storeQty $prediction\n"
      }.saveAsTextFile(outputPath)

      System.out.println("-----------------------------")
    } finally {
      // Release the context even when a query or training step throws.
      sc.stop()
    }
  }

}

改变迭代次数(随机选取初始点,runs=2)时的评估结果:

sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775106
sum of squared distances of points to their nearest center when itr=15 -> 68.45241255775106
sum of squared distances of points to their nearest center when itr=20 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=25 -> 68.38946484451297
sum of squared distances of points to their nearest center when itr=30 -> 69.15875531327036
sum of squared distances of points to their nearest center when itr=35 -> 68.5020394304827
sum of squared distances of points to their nearest center when itr=40 -> 68.64494935350622
---------------------------------------------------------------Cluster centers:---------------------------------------------------------------------
 [-0.3977231394410828,-0.08638511951423264]
 [-0.3525021012551558,1.5702237448594607]
 [1.4603723091512353,-0.8149743960785426]
--------------------------------------------------------------------Within Set Sum of Squared Errors=-----------------------------------------68.64494935350622
-----------------------------

sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=15 -> 68.52141419538006
sum of squared distances of points to their nearest center when itr=20 -> 68.38946484451294
sum of squared distances of points to their nearest center when itr=25 -> 69.15875531327033
sum of squared distances of points to their nearest center when itr=30 -> 68.64494935350619
sum of squared distances of points to their nearest center when itr=35 -> 68.64494935350619
sum of squared distances of points to their nearest center when itr=40 -> 68.45241255775107                                

随机选初始点,改变run的次数

sum of squared distances of points to their nearest center when run=1 -> 68.64494935350622
sum of squared distances of points to their nearest center when run=2 -> 78.26550176498397
sum of squared distances of points to their nearest center when run=3 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=4 -> 68.52141419538006
sum of squared distances of points to their nearest center when run=5 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=6 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=7 -> 68.38946484451297
sum of squared distances of points to their nearest center when run=8 -> 68.28913722951712
sum of squared distances of points to their nearest center when run=9 -> 68.2653607410948
sum of squared distances of points to their nearest center when run=10 -> 68.26536074109478
-----------------------------

kmeans++选初始点,改变run的次数

sum of squared distances of points to their nearest center when run=1 -> 68.28913722951711
sum of squared distances of points to their nearest center when run=2 -> 72.48322271456834
sum of squared distances of points to their nearest center when run=3 -> 68.45241255775106
sum of squared distances of points to their nearest center when run=4 -> 68.6449493535062
sum of squared distances of points to their nearest center when run=5 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=6 -> 68.45241255775105
sum of squared distances of points to their nearest center when run=7 -> 68.45241255775106
sum of squared distances of points to their nearest center when run=8 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=9 -> 68.26536074109477
sum of squared distances of points to their nearest center when run=10 -> 68.28913722951711
-----------------------------

迭代20次,run2次的评估结果:

sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107
sum of squared distances of points to their nearest center when itr=15 -> 72.28822380601336
sum of squared distances of points to their nearest center when itr=20 -> 72.48322271456834
sum of squared distances of points to their nearest center when itr=25 -> 68.64494935350622
sum of squared distances of points to their nearest center when itr=30 -> 68.38946484451297
sum of squared distances of points to their nearest center when itr=35 -> 68.28913722951712
sum of squared distances of points to their nearest center when itr=40 -> 70.36885180809672
-----------------------------

迭代20次,run2次的kmeans++
聚类结果

[094]兴关店 18706 57706.000000 0

[012]贵钢店 63320 40860.000000 0

[056]观水店 28934 81498.000000 1

[043]云阳店 49752 51101.000000 0

[027]湘雅店 116073 32931.000000 2

[077]凤凰翠堤 50637 42147.000000 0

[054]O六一店 55564 62501.000000 1

[073]和平店 37079 60840.000000 1

[075]四方河店 53378 54411.000000 0

[065]比兰德店 76568 48998.000000 2

[051]新威店 44724 51807.000000 0

[085]摩卡店 21160 56871.000000 0

[024]金狮店A 43576 49296.000000 0

[017]欣歆店 32897 59049.000000 0

[046]贝地店 44829 48188.000000 0

[063]金果园店 55705 31281.000000 0

[019]黔灵店 38509 81074.000000 1

[060]筑兴店 73269 30190.000000 2

[036]华阳店 47707 51533.000000 0

[037]小石城 52363 50968.000000 0

[034]交校店 21708 37380.000000 0

[079]万江店 44458 48095.000000 0

[096]鸿通城 38592 50089.000000 0

[072]吉奥店 45314 51248.000000 0

[067]二中店 5713 2870.000000 0

[093]新光店 14395 43151.000000 0

[040]玉田店 73169 31059.000000 2

[076]三桥北店 66542 49079.000000 0

[070]军区店 31491 45743.000000 0

[084]中天店 48581 50627.000000 0

[042]马王店 49864 51891.000000 0

[001]白云一店 94509 60483.000000 2

[055]城基店 46682 55537.000000 0

[068]枣山店 50042 58374.000000 0

[087]警校店 18675 37316.000000 0

[078]世纪新城 33088 38337.000000 0

[069]松竹苑店 44036 46503.000000 0

[050]世纪园店 39734 38059.000000 0

[008]省委店 41826 91960.000000 1

[026]501店 42622 59411.000000 0

[010]教育学院店 82740 33065.000000 2

[035]曦阳店 55683 41977.000000 0

[038]振华店 41467 71864.000000 1

[071]枫丹店 138029 27649.000000 2

[015]清水江店 73585 25374.000000 2

[089]蟠桃宫店 23821 54760.000000 0

[013]瑞和店 33334 43215.000000 0

[022]083店A 43406 56510.000000 0

[049]贵龙店 84094 34512.000000 2

[066]六广门店 33312 63732.000000 1

[029]大理石店 96701 58521.000000 2

[095]叠翠谷店 33754 55371.000000 0

[014]虹桥店 82285 36892.000000 2

[006]月亮岩 65030 52661.000000 0

[098]浦江店 15519 75004.000000 1

[011]凤凰店 41914 50276.000000 0

[086]东新店 33324 56329.000000 0

[081]福楼旺邸店 41874 50275.000000 0

[062]头桥店 53363 53378.000000 0

[041]万东店 100295 32096.000000 2

[007]家乐店 60275 51752.000000 0

[028]威清店 21674 37084.000000 0

[074]十二中店 50059 38261.000000 0

[053]嘉怡店 45155 33315.000000 0

[032]宅吉店 60291 53308.000000 0

[030]东门店 73298 40233.000000 2

[061]太慈店 90692 30359.000000 2

[059]中北店 49589 67715.000000 1

[021]183店 62289 26584.000000 2

[033]新发店 182588 50403.000000 2

[020]贵医店 50190 64561.000000 1

[047]宝山南店 48689 46983.000000 0

[090]保利温泉店 29566 53804.000000 0

[002]白云二店 72947 61744.000000 1

[092]龙宇店 29702 52608.000000 0

[025]宏福店 16933 48769.000000 0
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值