Spark 里面有很多分类算法：线性模型（SVM、逻辑回归、线性回归）、naive Bayes、决策树等。
这里我采用 ML 库来做分类，支持 LR、决策树、还有 NaiveBayes 等模型。
获取训练数据（已知性别的数据在 Hive 里面，所以从中获取数据），表名我就用 table1 代替了。
// Load the labeled (sex, feature) training data from Hive.
// NOTE(review): "tainDataDF" looks like a typo for "trainDataDF" — kept as-is
// because later snippets reference it by this name.
val tainDataDF = getKnownSex(hiveContext, pathsave)
/**
 * Loads labeled training data from Hive: joins the known (imei, sex) labels
 * with the per-imei feature strings and keeps only rows that actually have
 * a feature value.
 *
 * @param hiveContext Hive SQL context used to run the queries
 * @param path        output path — NOTE(review): currently unused in this body
 * @return DataFrame with columns ("sex", "feature"), feature guaranteed non-null
 */
def getKnownSex(hiveContext: HiveContext, path: String) = {
  // Disable output compression so any files written downstream are plain text.
  hiveContext.setConf("mapred.output.compress", "false")
  hiveContext.setConf("hive.exec.compress.output", "false")
  hiveContext.setConf("mapreduce.output.fileoutputformat.compress", "false")
  val knowImei = hiveContext.sql("select imei,sex from table1")
  println("====")
  val unkownImei = hiveContext.sql("select imei,feature from table1 where stat_date=xxx")
  // Left join labels with features, keeping only the label + feature columns.
  val tainDataDF = knowImei.join(unkownImei, knowImei("imei") === unkownImei("imei"), "left").select("sex", "feature")
  // The left join yields null "feature" for imeis with no feature row — drop those.
  val tainDataDF2 = tainDataDF.where(tainDataDF("feature").isNotNull)
  tainDataDF2
}
把DF数据保存在hdfs上
// Flatten each Row of (sex label, feature string) into a plain tuple RDD.
val trandatas = tainDataDF.map { case Row(sexLabel: Int, featureStr: String) => (sexLabel, featureStr) }
然后把数据处理成决策树可以训练的 LabeledPoint 格式，再转化成 DataFrame。
/**
 * Loads SVMLight-format training data and reports its feature dimensionality.
 *
 * @param sc   active SparkContext
 * @param path HDFS/local path of the SVMLight file
 * @return the loaded RDD of labeled points (the DataFrame is only used to
 *         probe the feature count)
 */
def loadDatasets(sc: SparkContext, path: String) = {
  val sqlContext = new SQLContext(sc)
  import sqlContext.implicits._
  // -1 lets the loader infer the number of features from the data.
  // NOTE(review): assumes Utils.loadSVMFile mirrors MLUtils.loadLibSVMFile — confirm.
  val datas = Utils.loadSVMFile(sc, path, -1)
  val dataframes = datas.toDF()
  // Probe the first row for the feature-vector size (fails on an empty dataset).
  val numFeatures = dataframes.select("features").first().getAs[Vector](0).size
  println(s"特征数据量为 $numFeatures")
  datas
}
对数据进行训练
// ---- Model training --------------------------------------------------------
// Assemble the ML pipeline: label indexing -> feature scaling -> classifier.
val stages = new mutable.ArrayBuffer[PipelineStage]()
// Map the raw labels to indexed doubles. The output column name
// "indexedLablel" is a typo but is kept byte-identical — it is referenced
// by name wherever the label column is looked up, so it must stay consistent.
val lableIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLablel").fit(datas)
stages += lableIndexer
// Standardize features by std-dev only; the mean is left untouched so that
// sparse vectors stay sparse.
val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(false)
val scalerModel = scaler.fit(datas)
stages += scalerModel
// 80/20 train/test split.
val splits = datas.randomSplit(Array(0.8, 0.2))
val training = splits(0)
val test = splits(1)
// Decision-tree classifier over the scaled features.
// Alternative model kept for reference:
//   new LogisticRegression().setFeaturesCol("scaledFeatures")
//     .setLabelCol("indexedLablel").setRegParam(0.01).setMaxIter(30)
val sexModel = new DecisionTreeClassifier().setFeaturesCol("scaledFeatures").setLabelCol("indexedLablel").setMaxDepth(15).setMaxBins(100)
stages += sexModel
val pipline = new Pipeline().setStages(stages.toArray)
val startime = System.nanoTime()
val pipelineModel = pipline.fit(training)
// startime was previously captured but never used — report the elapsed time.
println(s"训练耗时 ${(System.nanoTime() - startime) / 1e9} 秒")
training.printSchema()
// Make predictions on the held-out test set.
val predictions = pipelineModel.transform(test)
用训练好的模型对测试集进行预测，然后计算 AUC 和准确率。
/**
 * Evaluates a fitted model on a labeled DataFrame.
 *
 * @param model        fitted Transformer (e.g. a PipelineModel)
 * @param data         DataFrame containing the features and the label column
 * @param lableColName name of the double-valued (indexed) label column
 * @return (areaUnderROC, accuracy) — AUC is only meaningful for binary labels
 */
def evaluateModel(model: Transformer,
                  data: DataFrame,
                  lableColName: String) = {
  val testdataPrediction = model.transform(data)
  // Select prediction and label in ONE pass: the previous code zipped two RDDs
  // derived from two separate select()s, but RDD.zip requires identical
  // partitioning and per-partition element counts, and the double derivation
  // also computed the transform twice. One select avoids both problems.
  val predictionAndLabel = testdataPrediction.select("prediction", lableColName).map { row =>
    (row.getDouble(0), row.getDouble(1))
  }
  // BinaryClassificationMetrics consumes (score, label) pairs.
  val metrics = new BinaryClassificationMetrics(predictionAndLabel)
  val auc = metrics.areaUnderROC()
  // Accuracy = fraction of rows whose prediction matches the label exactly.
  val correct = predictionAndLabel.map { case (p, l) => if (p == l) 1 else 0 }.sum
  val acc = correct / predictionAndLabel.count()
  (auc, acc)
}
。。。。一些细节可以私聊了。。。