Spark 里面有很多分类算法：线性模型（SVM、逻辑回归、线性回归）、naive Bayes、决策树等。
这里我采用 ML 库来做分类，支持 LR、决策树、还有 NaiveBayes 等模型。
获取训练数据（已知性别的数据在 Hive 里面，所以从中获取数据），表名我就用 table1 代替了。
// Load the labeled (sex, feature) training data from Hive.
// NOTE(review): "tainDataDF" looks like a typo for "trainDataDF" — kept as-is
// because later snippets reference it by this name.
val tainDataDF = getKnownSex(hiveContext, pathsave)
/**
 * Loads labeled training data from Hive: joins the known (imei, sex) labels
 * with the per-imei feature strings and keeps only rows that actually have
 * a feature value.
 *
 * @param hiveContext Hive SQL context used to run the queries
 * @param path        output path — NOTE(review): currently unused in this body
 * @return DataFrame with columns ("sex", "feature"), feature guaranteed non-null
 */
def getKnownSex(hiveContext: HiveContext, path: String) = {
  // Disable output compression so any files written downstream are plain text.
  hiveContext.setConf("mapred.output.compress", "false")
  hiveContext.setConf("hive.exec.compress.output", "false")
  hiveContext.setConf("mapreduce.output.fileoutputformat.compress", "false")
  val knowImei = hiveContext.sql("select imei,sex from table1")
  println("====")
  val unkownImei = hiveContext.sql("select imei,feature from table1 where stat_date=xxx")
  // Left join labels with features, keeping only the label + feature columns.
  val tainDataDF = knowImei.join(unkownImei, knowImei("imei") === unkownImei("imei"), "left").select("sex", "feature")
  // The left join yields null "feature" for imeis with no feature row — drop those.
  val tainDataDF2 = tainDataDF.where(tainDataDF("feature").isNotNull)
  tainDataDF2
}
把DF数据保存在hdfs上
// Flatten each Row of (sex label, feature string) into a plain tuple RDD.
val trandatas = tainDataDF.map { case Row(sexLabel: Int, featureStr: String) => (sexLabel, featureStr) }
然后把数据处理成决策树可以训练的 LabeledPoint 格式，再转化成 DataFrame。
/**
 * Loads SVMLight-format training data and reports its feature dimensionality.
 *
 * @param sc   active SparkContext
 * @param path HDFS/local path of the SVMLight file
 * @return the loaded RDD of labeled points (the DataFrame is only used to
 *         probe the feature count)
 */
def loadDatasets(sc: SparkContext, path: String) = {
  val sqlContext = new SQLContext(sc)
  import sqlContext.implicits._
  // -1 lets the loader infer the number of features from the data.
  // NOTE(review): assumes Utils.loadSVMFile mirrors MLUtils.loadLibSVMFile — confirm.
  val datas = Utils.loadSVMFile(sc, path, -1)
  val dataframes = datas.toDF()
  // Probe the first row for the feature-vector size (fails on an empty dataset).
  val numFeatures = dataframes.select("features").first().getAs[Vector](0).size
  println(s"特征数据量为 $numFeatures")
  datas
}
对数据进行训练
// ---- Model training --------------------------------------------------------
// Assemble the ML pipeline: label indexing -> feature scaling -> classifier.
val stages = new mutable.ArrayBuffer[PipelineStage]()
// Map the raw labels to indexed doubles. The output column name
// "indexedLablel" is a typo but is kept byte-identical — it is referenced
// by name wherever the label column is looked up, so it must stay consistent.
val lableIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLablel").fit(datas)
stages += lableIndexer
// Standardize features by std-dev only; the mean is left untouched so that
// sparse vectors stay sparse.
val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(false)
val scalerModel = scaler.fit(datas)
stages += scalerModel
// 80/20 train/test split.
val splits = datas.randomSplit(Array(0.8, 0.2))
val training = splits(0)
val test = splits(1)
// Decision-tree classifier over the scaled features.
// Alternative model kept for reference:
//   new LogisticRegression().setFeaturesCol("scaledFeatures")
//     .setLabelCol("indexedLablel").setRegParam(0.01).setMaxIter(30)
val sexModel = new DecisionTreeClassifier().setFeaturesCol("scaledFeatures").setLabelCol("indexedLablel").setMaxDepth(15).setMaxBins(100)
stages += sexModel
val pipline = new Pipeline().setStages(stages.toArray)
val startime = System.nanoTime()
val pipelineModel = pipline.fit(training)
// startime was previously captured but never used — report the elapsed time.
println(s"训练耗时 ${(System.nanoTime() - startime) / 1e9} 秒")
training.printSchema()
// Make predictions on the held-out test set.
val predictions = pipelineModel.transform(test)
用训练好的模型对测试集进行预测，然后计算 AUC 和准确率。
/**
 * Evaluates a fitted model on a labeled DataFrame.
 *
 * @param model        fitted Transformer (e.g. a PipelineModel)
 * @param data         DataFrame containing the features and the label column
 * @param lableColName name of the double-valued (indexed) label column
 * @return (areaUnderROC, accuracy) — AUC is only meaningful for binary labels
 */
def evaluateModel(model: Transformer,
                  data: DataFrame,
                  lableColName: String) = {
  val testdataPrediction = model.transform(data)
  // Select prediction and label in ONE pass: the previous code zipped two RDDs
  // derived from two separate select()s, but RDD.zip requires identical
  // partitioning and per-partition element counts, and the double derivation
  // also computed the transform twice. One select avoids both problems.
  val predictionAndLabel = testdataPrediction.select("prediction", lableColName).map { row =>
    (row.getDouble(0), row.getDouble(1))
  }
  // BinaryClassificationMetrics consumes (score, label) pairs.
  val metrics = new BinaryClassificationMetrics(predictionAndLabel)
  val auc = metrics.areaUnderROC()
  // Accuracy = fraction of rows whose prediction matches the label exactly.
  val correct = predictionAndLabel.map { case (p, l) => if (p == l) 1 else 0 }.sum
  val acc = correct / predictionAndLabel.count()
  (auc, acc)
}
。。。。一些细节可以私聊了。。。