PySpark机器学习(3)——LR和SVM

本文在PySpark环境下分别实现逻辑回归(Logistic Regression,LR)和支持向量机(SVM)两种二分类算法,并在测试集上评估分类效果,实现代码如下所示:

1.LR实现代码:

%pyspark

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Logistic-regression workflow: load the training table, fit the model, score
# the test table and print accuracy / per-class recall.
#
# NOTE(review): the code treats the LAST column of each table as the label and
# every preceding column as a numeric feature; labels are presumably 0/1 --
# confirm against the table schema.


def _is_complete(row):
    """Return True when no column of *row* is NULL or an empty string."""
    return all(item is not None and item != "" for item in row)


def _to_labeled_row(row):
    """Convert a raw table row into Row(label, features); last column is the label."""
    values = [float(item) for item in row]
    return Row(label=values[-1], features=Vectors.dense(values[:-1]))


def _build_dataset(df):
    """Turn a raw DataFrame into a (label, features) DataFrame.

    Rows with a missing value are dropped as a whole.  Filtering individual
    falsy items instead (``if item``) would also discard legitimate zeros --
    including the label 0 -- and shift the remaining columns, so a feature
    would silently be used as the label.
    """
    return df.rdd.map(list).filter(_is_complete).map(_to_labeled_row).toDF()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN for a zero denominator."""
    if not denominator:
        return float("nan")
    # float() keeps true division even on a Python 2 interpreter.
    return float(numerator) / denominator


# 1. Training
# 1.1 Load the data
trainData = spark.sql("""select * from trainData_XXX""")

# 1.2 Build the training set
trainingSet = _build_dataset(trainData)

LR = LogisticRegression(regParam=0.01)
LRModel = LR.fit(trainingSet)

print("相关系数:{}".format(LRModel.coefficients))
print("截距:{}".format(LRModel.intercept))

# 2. Testing
# 2.1 Load the data
testData = spark.sql("""select * from testData_XXX""")

# 2.2 Build the test set
testSet = _build_dataset(testData)

result = LRModel.transform(testSet)
# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
result.show()

# 2.3 Evaluate the classifier
# One aggregation yields the whole confusion table, instead of launching a
# separate Spark job (and re-scoring the test set) for every count.
confusion = {
    (r["label"], r["prediction"]): r["count"]
    for r in result.groupBy("label", "prediction").count().collect()
}

total_amount = sum(confusion.values())
correct_amount = sum(
    amount for (label, prediction), amount in confusion.items() if label == prediction
)
precision_rate = _safe_ratio(correct_amount, total_amount)
print("预测准确率为:{}".format(precision_rate))

# Keys are (label, prediction); 1 == 1.0 so integer literals match the doubles.
positive_precision_amount = confusion.get((1, 1), 0)
negative_precision_amount = confusion.get((0, 0), 0)
positive_false_amount = confusion.get((1, 0), 0)
negative_false_amount = confusion.get((0, 1), 0)

print("正样本预测准确数量:{},负样本预测准确数量:{}".format(positive_precision_amount,negative_precision_amount))

positive_amount = sum(amount for (label, _), amount in confusion.items() if label == 1)
negative_amount = sum(amount for (label, _), amount in confusion.items() if label == 0)

print("正样本数:{},负样本数:{}".format(positive_amount,negative_amount))

print("正样本预测错误数量:{},负样本预测错误数量:{}".format(positive_false_amount,negative_false_amount))

# Recall per class; NaN (rather than a crash) when a class is absent.
recall_rate1 = _safe_ratio(positive_precision_amount, positive_amount)
recall_rate2 = _safe_ratio(negative_precision_amount, negative_amount)

print("正样本召回率为:{},负样本召回率为:{}".format(recall_rate1,recall_rate2))


2.SVM实现代码:

%pyspark

from pyspark.sql.types import *
from pyspark.mllib.classification import SVMModel
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Linear-SVM workflow (RDD-based MLlib API): load the training table, train
# with SGD, then report accuracy, area under PR and area under ROC on the test
# table.
#
# NOTE(review): the code treats the LAST column of each table as the label and
# every preceding column as a numeric feature -- confirm against the schema.


def _to_labeled_point(row):
    """Convert a table row into a LabeledPoint; last column is the label."""
    return LabeledPoint(label=row[-1], features=row[:-1])


# 1. Train the model
# 1.1 Load the data and build the training set.
# The rows are mapped directly on the executors: Row is a tuple subclass, so
# indexing/slicing works without first collect()-ing the table to the driver
# and re-parallelizing it (which does not scale and can exhaust driver memory).
data = spark.sql("""select * from trainData_XXX""").rdd
# Cached because both count() and the iterative SGD training read it.
trainData = data.map(_to_labeled_point).cache()

print("训练集数量:{}".format(trainData.count()))

# 1.2 Train the model
svm = SVMWithSGD.train(trainData, iterations=20)

# 2. Evaluate the trained model
# 2.1 Build the test set
data2 = spark.sql("""select * from testData_XXX""").rdd
# Cached because it is scanned several times below.
testData = data2.map(_to_labeled_point).cache()
testCount = testData.count()
print("测试集数量:{}".format(testCount))

# 2.2 Classification quality
# Overall accuracy: with the threshold in place predict() returns the class.
svmTotalCorrect = testData.map(lambda x: 1 if svm.predict(x.features) == x.label else 0).sum()
# NaN instead of ZeroDivisionError when the test table is empty.
svmAccuracy = float(svmTotalCorrect) / testCount if testCount else float("nan")
print("总体预测准确率为{}".format(svmAccuracy))

# AUC computation.
# ROC / PR curves need a continuous score.  With a threshold set, predict()
# only yields 0/1, which collapses each curve to a single operating point, so
# the threshold is cleared to obtain raw margins and restored afterwards.
originalThreshold = svm.threshold
svm.clearThreshold()
try:
    scoreAndLabels = testData.map(lambda x:(float(svm.predict(x.features)),x.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    print('PR值:{:.4f},AUC值:{:.4f}'.format(metrics.areaUnderPR, metrics.areaUnderROC))
finally:
    # Later svm.predict() calls go back to returning class labels.
    svm.setThreshold(originalThreshold)






©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页