利用决策树算法进行预测

利用决策树算法进行预测

from pyspark.ml.feature import StringIndexer, VectorAssembler

# Load the Titanic training CSV (header row, inferred column types) and cache it.
df_train = spark.read.csv(
    './data/titanic-train.csv',
    header=True,
    inferSchema=True,
).cache()

# Replace nulls: Age gets round(29.699, 0), Embarked gets 'S'.
# NOTE(review): 29.699 is presumably the mean passenger age and 'S' the most
# frequent port - confirm against the dataset.
df_train = df_train.fillna({'Age': round(29.699, 0), 'Embarked': 'S'})

# Run a StringIndexer over each string column in turn, writing the indexed
# values to a new column named with an 'i' prefix. Embarked is processed
# first, then Sex, and each indexer is fitted on the current df_train.
for source_col, indexed_col in (("Embarked", "iEmbarked"), ("Sex", "iSex")):
    labelIndexer = StringIndexer(inputCol=source_col, outputCol=indexed_col)
    model = labelIndexer.fit(df_train)
    df_train = model.transform(df_train)

# Predictor columns, declared once so the column selection and the assembler
# always use the same list.
input_cols = ['Pclass', 'iSex', 'Age', 'SibSp', 'Parch', 'Fare', 'iEmbarked']

# Keep only the numeric-typed fields: the predictors plus the 'Survived' column.
features = input_cols + ['Survived']
train_features = df_train.select(*features)

# Combine the predictor columns into a single vector column named "features".
df_assembler = VectorAssembler(outputCol="features", inputCols=input_cols)
train = df_assembler.transform(train_features)


from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree model: 'Survived' is the label column, 'features' the input vector.
dtree = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Survived",
)
treeModel = dtree.fit(train)

# Print treeModel via its debug-string representation.
tree_description = treeModel.toDebugString
print(tree_description)


# Use the model's transform method to predict on the training data `train`.
dt_predictions = treeModel.transform(train)

# From the prediction result, display only these three fields.
preview_cols = ["prediction", "Survived", "features"]
dt_predictions.select(*preview_cols).show()


from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator configured with 'Survived' as the label column and the
# 'accuracy' metric.
multi_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Survived',
)

# NOTE(review): dt_predictions comes from transforming the same `train` data
# the model was fitted on, so this is a training-set score.
dt_accuracy = multi_evaluator.evaluate(dt_predictions)
print('Decision Tree Accu:', dt_accuracy)


注释:
accuracy 准确率(预测正确的样本数占样本总数的比例;注意不要与 precision"精确率"混淆)


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值