spark-MLlib:决策树

以iris数据集(iris):

from pyspark.ml.linalg import Vector,Vectors
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer,VectorIndexer
from pyspark.ml.classification import DecisionTreeClassificationModel,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]),float(x[1]),float(x[2]),float(x[3]))
    rel['label'] = str(x[4])
    return rel
 
data = spark.sparkContext.textFile("file:///usr/local/spark/iris.txt").map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
 
data.createOrReplaceTempView("iris")
 
df = spark.sql("select * from iris")
 
rel = df.rdd.map(lambda t : str(t[1])+":"+str(t[0])).collect() 
# rel :Iris-setosa:[5.1,3.5,1.4,0.2]

# 分别获取标签列和特征列,并进行重命名
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
 
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df)
 
# 设置一个labelConverter,目的是把预测的类别重新转化成字符型的。
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
# 接下来,我们把数据集随机分成训练集和测试集,其中训练集占70%。
trainingData, testData = data.randomSplit([0.7, 0.3])

#------------------------------------------------------------------
# 训练决策树模型,通过setter的方法来设置决策树的参数,也可以用ParamMap来设置
dtClassifier=
DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
# 在pipeline中进行设置
pipelinedClassifier = Pipeline().setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter])
# 训练决策树模型
modelClassifier = pipelinedClassifier.fit(trainingData)
# 进行预测
predictionsClassifier = modelClassifier.transform(testData)
# 查看部分预测的结果
predictionsClassifier.select("predictedLabel", "label", "features").show(20)

预测结果:

+---------------+---------------+-----------------+
| predictedLabel|          label|         features|
+---------------+---------------+-----------------+
|    Iris-setosa|    Iris-setosa|[4.3,3.0,1.1,0.1]|
|    Iris-setosa|    Iris-setosa|[4.6,3.4,1.4,0.3]|
|    Iris-setosa|    Iris-setosa|[4.6,3.6,1.0,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.0,1.4,0.1]|
|    Iris-setosa|    Iris-setosa|[4.8,3.1,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.4,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.0,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.1,1.5,0.1]|
|    Iris-setosa|    Iris-setosa|[5.0,3.5,1.3,0.3]|
|    Iris-setosa|    Iris-setosa|[5.1,3.3,1.7,0.5]|
|    Iris-setosa|    Iris-setosa|[5.1,3.4,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[5.1,3.7,1.5,0.4]|
|    Iris-setosa|    Iris-setosa|[5.1,3.8,1.9,0.4]|
|Iris-versicolor|Iris-versicolor|[5.2,2.7,3.9,1.4]|
|    Iris-setosa|    Iris-setosa|[5.4,3.9,1.3,0.4]|
|Iris-versicolor|Iris-versicolor|[5.7,2.8,4.5,1.3]|
|Iris-versicolor|Iris-versicolor|[5.8,2.7,4.1,1.0]|
|    Iris-setosa|    Iris-setosa|[5.8,4.0,1.2,0.2]|
| Iris-virginica|Iris-versicolor|[5.9,3.2,4.8,1.8]|
|Iris-versicolor|Iris-versicolor|[6.1,2.9,4.7,1.4]|
+---------------+---------------+-----------------+
only showing top 20 rows

 评估决策树分类模型:

evaluatorClassifier = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
 accuracy = evaluatorClassifier.evaluate(predictionsClassifier)
print("Test Error = " + str(1.0 - accuracy))
treeModelClassifier = modelClassifier.stages[2]
print("Learned classification tree model:\n" + str(treeModelClassifier.toDebugString))


#结果---------------------------------------
Test Error = 0.05882352941176472
Learned classification tree model:
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4e57b26beacfd363271a) of depth 3 with 7 nodes
  If (feature 2 <= 1.9)
   Predict: 2.0
  Else (feature 2 > 1.9)
   If (feature 3 <= 1.6)
    If (feature 2 <= 4.9)
     Predict: 0.0
    Else (feature 2 > 4.9)
     Predict: 1.0
   Else (feature 3 > 1.6)
    Predict: 1.0

 

二.决策树回归模型

from pyspark.ml.regression import DecisionTreeRegressionModel,DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
#训练决策树模型
dtRegressor = DecisionTreeRegressor().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
#在pipeline中进行设置
pipelineRegressor = Pipeline().setStages([labelIndexer, featureIndexer, dtRegressor, labelConverter])
#训练决策树模型
modelRegressor = pipelineRegressor.fit(trainingData)
# 进行预测
predictionsRegressor = modelRegressor.transform(testData)
# 查看部分预测结果
predictionsRegressor.select("predictedLabel", "label", "features").show(20)

# 评估模型
evaluatorRegressor = RegressionEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("rmse")
rmse = evaluatorRegressor.evaluate(predictionsRegressor)
print("Root Mean Squared Error (RMSE) on test data = " +str(rmse)) 
treeModelRegressor = modelRegressor.stages[2]
print("Learned regression tree model:\n" + str(treeModelRegressor.toDebugString))

 

预测结果:

+---------------+---------------+-----------------+
| predictedLabel|          label|         features|
+---------------+---------------+-----------------+
|    Iris-setosa|    Iris-setosa|[4.3,3.0,1.1,0.1]|
|    Iris-setosa|    Iris-setosa|[4.6,3.4,1.4,0.3]|
|    Iris-setosa|    Iris-setosa|[4.6,3.6,1.0,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.0,1.4,0.1]|
|    Iris-setosa|    Iris-setosa|[4.8,3.1,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.4,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.0,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.1,1.5,0.1]|
|    Iris-setosa|    Iris-setosa|[5.0,3.5,1.3,0.3]|
|    Iris-setosa|    Iris-setosa|[5.1,3.3,1.7,0.5]|
|    Iris-setosa|    Iris-setosa|[5.1,3.4,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[5.1,3.7,1.5,0.4]|
|    Iris-setosa|    Iris-setosa|[5.1,3.8,1.9,0.4]|
|Iris-versicolor|Iris-versicolor|[5.2,2.7,3.9,1.4]|
|    Iris-setosa|    Iris-setosa|[5.4,3.9,1.3,0.4]|
|Iris-versicolor|Iris-versicolor|[5.7,2.8,4.5,1.3]|
|Iris-versicolor|Iris-versicolor|[5.8,2.7,4.1,1.0]|
|    Iris-setosa|    Iris-setosa|[5.8,4.0,1.2,0.2]|
| Iris-virginica|Iris-versicolor|[5.9,3.2,4.8,1.8]|
|Iris-versicolor|Iris-versicolor|[6.1,2.9,4.7,1.4]|
+---------------+---------------+-----------------+
only showing top 20 rows

Root Mean Squared Error (RMSE) on test data = 0.24253562503633297
Learned regression tree model:
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4325a44aff74cf6ff7b3) of depth 3 with 7 nodes
  If (feature 2 <= 1.9)
   Predict: 2.0
  Else (feature 2 > 1.9)
   If (feature 3 <= 1.6)
    If (feature 2 <= 4.9)
     Predict: 0.0
    Else (feature 2 > 4.9)
     Predict: 1.0
   Else (feature 3 > 1.6)
    Predict: 1.0

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值