spark--案例分享--性别预测

spark里面有很多分类算法

例如线性模型(linear models:SVMs、logistic regression、linear regression)、naive Bayes、decision trees等

这里我采用ML包来做,里面的分类算法支持LR、决策树,还有NaiveBayesModel等

获取训练数据(已知性别的数据在hive里面,所以从中获取数据),表名我就用table1代替了

// Build the training DataFrame (columns "sex" and "feature") by calling getKnownSex.
val tainDataDF = getKnownSex(hiveContext, pathsave)

 /**
   * Builds the labelled training set by joining known-sex devices with their features.
   *
   * Sets three output-compression settings on the context to "false", runs the two
   * Hive queries below, left-joins them on `imei`, and keeps only the rows whose
   * `feature` column is non-null.
   *
   * @param hiveContext Hive-enabled SQL context used to run both queries
   * @param path        not used by this method; kept so existing callers still compile
   * @return a DataFrame with the columns `sex` and `feature`, where `feature` is never null
   */
 def getKnownSex(hiveContext: HiveContext, path: String) = {
    Seq(
      "mapred.output.compress",
      "hive.exec.compress.output",
      "mapreduce.output.fileoutputformat.compress"
    ).foreach(key => hiveContext.setConf(key, "false"))

    val knowImei = hiveContext.sql("select imei,sex from table1")
    println("====")
    // NOTE(review): `table1` and `stat_date=xxx` look like placeholders for the real
    // table name and partition date - confirm before running.
    val unkownImei = hiveContext.sql("select  imei,feature  from  table1  where stat_date=xxx")

    // Left join keeps every known-sex device; devices without a feature row get a
    // null `feature`, which the filter below removes again.
    val joined = knowImei
      .join(unkownImei, knowImei("imei") === unkownImei("imei"), "left")
      .select("sex", "feature")
    joined.where(joined("feature").isNotNull)
  }

把DF数据转换成(label, features)的形式,以便保存在hdfs上(保存的代码这里省略)
// Unpack every Row into a plain (label, feature) tuple.
// A row that is not exactly (Int, String) does not match and raises scala.MatchError.
val trandatas = tainDataDF.map { row =>
  row match {
    case Row(label: Int, feature: String) => (label, feature)
  }
}

然后把数据处理成决策树可以训练的数据LabeledPoint,然后转化成DF

  /**
    * Loads a dataset through `Utils.loadSVMFile` and prints its feature dimension.
    *
    * The loaded data is converted to a DataFrame only to read the size of the
    * `features` vector in the first row; the value returned is the loaded data
    * itself, not that DataFrame.
    *
    * NOTE(review): the training snippet later in this file calls `.fit(datas)`,
    * which expects a DataFrame - confirm whether callers convert the result.
    *
    * @param sc   Spark context used for loading and for the temporary SQLContext
    * @param path location handed to `Utils.loadSVMFile`
    * @return whatever `Utils.loadSVMFile(sc, path, -1)` produced
    */
  def loadDatasets(sc: SparkContext, path: String) = {
    val sqlCtx = new SQLContext(sc)
    import sqlCtx.implicits._

    // Third argument is passed as -1; presumably "infer the feature count" - TODO confirm.
    val loaded = Utils.loadSVMFile(sc, path, -1)

    // An 80/20 randomSplit with per-split toDF() was tried here earlier and left disabled.
    val firstRow = loaded.toDF().select("features").first()
    val numFeatures = firstRow.getAs[Vector](0).size
    println(s"特征数据量为 $numFeatures")

    loaded
  }

对数据进行训练

// Training script: assembles label indexing, feature scaling and a decision-tree
// classifier into one Pipeline, fits it on 80% of `datas`, and scores the other 20%.
//val datas=trainingdrops
    // Pipeline stages are collected here in the order they will run.
    val stages = new mutable.ArrayBuffer[PipelineStage]()
    // Index the "label" column into "indexedLablel"; the indexer is fitted on ALL of `datas`.
    val lableIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLablel").fit(datas)
    //lableIndexer.t
    stages += lableIndexer
    /* val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(datas)
     stages += featureIndexer*/

    // Scale "features" into "scaledFeatures" by standard deviation, without mean-centering.
    val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(false)
    // Standardize the feature values.
    // NOTE(review): the scaler is fitted on the full `datas` before the train/test split
    // below, so test rows contribute to the scaling statistics - confirm this is intended.
    val scalerModel = scaler.fit(datas)
    stages += scalerModel
    // Split the data: 80% for training, 20% for testing (no seed is passed).
    //trainingdrop
    val splits = datas.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)
/*    test.show(10)
    val test2 = datas.map {
      case Row(label: Double, features: Vector) =>
        label
    }.countByValue()*/
     // Classifier in use: a decision tree on the scaled features and the indexed label.
     val sexModel = new DecisionTreeClassifier().setFeaturesCol("scaledFeatures").setLabelCol("indexedLablel").setMaxDepth(15).setMaxBins(100)
    //sexModel.tr
     //val sexModel= new LogisticRegression().setFeaturesCol("scaledFeatures").setLabelCol("indexedLablel").setRegParam(0.01)setMaxIter(30)
    // val sexModel = new DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label").setMaxDepth(10).setMaxBins(10)
    //datas.printSchema()
    stages += sexModel
    val pipline = new Pipeline().setStages(stages.toArray)
    // Start timestamp; it is not read anywhere in this snippet.
    val startime = System.nanoTime()

    // Fit on the training split only. The indexer and scaler stages were already fitted above.
    val pipelineModel = pipline.fit(training)
    training.printSchema()
    // Make predictions.
    val predictions = pipelineModel.transform(test)


对测试数据进行预测,然后计算AUC和准确率

/**
  * Scores `data` with `model` and reports AUC together with plain accuracy.
  *
  * Both the `prediction` column and the label column are read with `getDouble`,
  * so both must hold non-null Double values.
  *
  * NOTE(review): AUC is computed from the hard `prediction` values rather than
  * from a probability or raw score column, so the ROC curve has very few
  * thresholds - confirm that this is the intended metric.
  *
  * @param model        transformer whose output contains a `prediction` column
  * @param data         rows to score; they must carry the column `lableColName`
  * @param lableColName name of the ground-truth label column
  * @return `(auc, acc)`: area under the ROC curve and the fraction of rows whose
  *         prediction equals the label (NaN when `data` is empty)
  */
def evaluateModel(model: Transformer,
                    data: DataFrame,
                    lableColName: String): (Double, Double) = {
  val scored = model.transform(data)

  // Read prediction and label from the same row in a single pass. Extracting them
  // as two separate RDDs and zipping them evaluates the model twice and only pairs
  // correctly when both evaluations yield identical partitioning and row order.
  val predictionAndLabels = scored
    .select("prediction", lableColName)
    .rdd
    .map(row => (row.getDouble(0), row.getDouble(1)))

  val metrics = new BinaryClassificationMetrics(predictionAndLabels)
  val auc = metrics.areaUnderROC()

  val hits = predictionAndLabels.map {
    case (predicted, actual) => if (predicted == actual) 1.0 else 0.0
  }.sum()
  val acc = hits / predictionAndLabels.count()

  // Result recorded by the original author for one run:
  // (0.5849461445602667, 0.924936788874842)
  (auc, acc)
}

。。。。一些细节可以私聊了。。。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值