- 多分类
-
- 分类算法实战
- 下载数据集:
- 分类算法实战
http://archive.ics.uci.edu/ml/machine-learning-databases/00215/
亚马逊评论测试数据集(Amazon Commerce Reviews),数据格式是 Weka 的 ARFF,后缀名 .arff。
-
-
- 算法
-
导入数据集,清洗后分成测试数据和训练数据
选择分类模型,设置算法参数
训练数据(fit),用模型预测数据
预测结果正确率等参数打印
-
-
- 旧代码(mllib库的实现,将来废弃)
-
读取文本数据
//load data from filename
// Load the ARFF file and convert each data row into a LabeledPoint.
// Rows: comma-separated numeric features with the class label in the last column.
// Header lines (starting with '@') and empty lines are skipped.
def getData(filename: String) = {
  val rawLines = sc.sparkContext.textFile(filename)
    .filter(line => line.length() > 0 && !line.startsWith("@"))
  // BUG FIX: the original mutated a driver-side mutable Map inside the RDD
  // `map` closure. Closures are serialized to executors, so every executor
  // mutated its own private copy: label indices were inconsistent across
  // partitions and the driver's map was never updated. Build the
  // label -> index table on the driver instead, with a deterministic
  // (sorted) ordering. The old seed entry "null" -> 0.0 also shifted the
  // first real label to 1.0; indices now start at 0.
  val labelIndex: Map[String, Double] =
    rawLines.map(_.split(",").last).distinct().collect()
      .sorted.zipWithIndex.map { case (lbl, i) => (lbl, i.toDouble) }.toMap
  val data = rawLines.map { line =>
    val cols = line.split(",")
    var newLabel = labelIndex(cols(cols.size - 1))
    // NOTE(review): LogisticRegressionWithSGD (see train below) is
    // binary-only, so classes beyond the first two are collapsed into 0.0.
    // The original marked this "temp test code" — replace with a real
    // multiclass model before production use.
    if (newLabel > 1.0) {
      newLabel = 0.0
    }
    // All columns except the last one are numeric features.
    val fts = cols.take(cols.size - 1).map(_.toDouble)
    LabeledPoint(newLabel, Vectors.dense(fts))
  }
  println("valid data counts:" + data.count())
  data
}
训练数据
// Train a binary logistic-regression model (RDD-based mllib API, deprecated)
// on trainData and print the number of correct predictions on testData.
def train(trainData: RDD[LabeledPoint], testData: RDD[LabeledPoint]) = {
  // Build the model and configure the SGD optimizer.
  val lr = new LogisticRegressionWithSGD()
  lr.optimizer
    .setNumIterations(500)
    .setUpdater(new SimpleUpdater())
    .setStepSize(0.001)
    .setMiniBatchFraction(0.02)
  val model = lr.run(trainData)
  // Pair every prediction with its true label.
  val predictionAndLabel = model.predict(testData.map(_.features)).zip(testData.map(_.label))
  // BUG FIX: the original incremented a driver-side `var count` inside an
  // RDD for-comprehension. That desugars to RDD.foreach, which runs on the
  // executors against a serialized copy of the closure, so the driver's
  // counter always stayed 0. Count matches with an RDD action instead.
  val correct = predictionAndLabel.filter { case (pred, label) => pred == label }.count()
  println("predict right:" + correct + ",total:" + testData.count)
}
主流程
// Driver for the mllib pipeline: load the data, split it 80/20 into
// train/test sets, then train and evaluate.
def run(filename: String) {
  println("amozanClassify is running!")
  val dataset = getData(filename)
  val Array(trainSet, testSet) = dataset.randomSplit(Array(0.8, 0.2))
  println("train size:" + trainSet.count() + ",test size:" + testSet.count())
  train(trainSet, testSet)
}
-
-
- 新代码(ml库的实现)
-
读取文本数据
数据处理可以和旧代码共用一套处理逻辑,读取文本文件,清洗之后将feature和label构造成LabeledPoint数组,旧的采用RDD,新的生成DataFrame进行处理。
RDD转换成DataFrame:
val traindf = sc.createDataFrame(trainData)
//load data from filename
// Load the ARFF file and convert each data row into a LabeledPoint.
// Rows: comma-separated numeric features with the class label in the last
// column. Header lines (starting with '@') and empty lines are skipped.
def getData(filename: String): RDD[LabeledPoint] = {
  val rawLines = sc.sparkContext.textFile(filename)
    .filter(line => line.length() > 0 && !line.startsWith("@"))
  // BUG FIX: the original mutated a driver-side mutable Map inside the RDD
  // `map` closure; executors only mutate their own serialized copies, so
  // label indices were inconsistent across partitions and the driver-side
  // `scores.size` print was admittedly unreliable ("不一定准"). Additionally,
  // `scores.put(label, scores.size - 1)` gave the first real label index
  // 0.0, colliding with the seeded "null" class. Build the label -> index
  // table on the driver with a deterministic (sorted) ordering instead.
  val labelIndex: Map[String, Double] =
    rawLines.map(_.split(",").last).distinct().collect()
      .sorted.zipWithIndex.map { case (lbl, i) => (lbl, i.toDouble) }.toMap
  val data = rawLines.map { line =>
    val cols = line.split(",")
    // All columns except the last one are numeric features.
    val fts = cols.take(cols.size - 1).map(_.toDouble)
    LabeledPoint(labelIndex(cols(cols.size - 1)), Vectors.dense(fts))
  }
  println(s"numClass count:${labelIndex.size}") // exact now: computed on the driver
  println(s"valid data counts:${data.count()}")
  data
}
训练数据
// Train a logistic-regression model (DataFrame-based ml API) on trainData,
// evaluate its accuracy on testData, and print the model summary fields.
def train(trainData: RDD[LabeledPoint], testData: RDD[LabeledPoint]) = {
  // Build the model and set hyper-parameters.
  val lr = new LogisticRegression() //建立模型
  lr.setMaxIter(50).setRegParam(0.01).setElasticNetParam(0.02)
  // Convert the RDDs to DataFrames with "label"/"features" columns.
  val traindf = sc.createDataFrame(trainData)
  val testdf = sc.createDataFrame(testData)
  val trained = lr.fit(traindf)
  val predicts = trained.transform(testdf)
  // BUG FIX: MulticlassClassificationEvaluator's default metric is "f1",
  // but the original printed the result as accuracy. Request "accuracy"
  // explicitly. (Also fixed the "Accurucy" typo in the output.)
  val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")
  val lrAccuracy = evaluator.evaluate(predicts)
  println(s"Accuracy is :${lrAccuracy}")
  println("二项逻辑回归模型系数矩阵: " + trained.coefficientMatrix)
  println("二项逻辑回归模型的截距向量: " + trained.interceptVector)
  println("类的数量(标签可以使用的值): " + trained.numClasses)
  println("模型所接受的特征的数量: " + trained.numFeatures)
  // Multinomial logistic regression does not provide a training summary.
  //多项式逻辑回归不包含对模型的摘要总结
  println(trained.hasSummary)
}
主流程
// Driver for the ml pipeline: load the data, split it 80/20 into
// train/test sets, then train and evaluate.
def run(filename: String) {
  println("amozanClassify is running!")
  val dataset = getData(filename)
  val Array(trainSet, testSet) = dataset.randomSplit(Array(0.8, 0.2))
  println(s"train size:${trainSet.count()},test size:${testSet.count()}")
  train(trainSet, testSet)
}
-
-
- NaiveByes分类
-
示例代码:
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Load the LIBSVM-formatted sample data as a DataFrame.
val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

// Hold out 30% of the rows for testing (fixed seed for reproducibility).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

// Fit a NaiveBayes model on the training split.
val model = new NaiveBayes().fit(trainingData)

// Score the held-out rows and show a sample of the predictions.
val predictions = model.transform(testData)
predictions.show()

// Compare predicted vs. true labels and report test-set accuracy.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test set accuracy = " + accuracy)