from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.feature import Normalizer, VectorAssembler, StandardScaler, StringIndexer
from pyspark.sql import SparkSession, types
# aa=aa[['tempcabinetnacelle_1sec','blade3tempbattbox_1sec','tempcabinetnacelle_1sec','label']]
# trainData=aa[['tempcabinetnacelle_1sec','blade3tempbattbox_1sec','tempcabinetnacelle_1sec','label']]
# Rank three turbine-temperature sensor columns by random-forest feature
# importance: load the CSV, cast columns to numeric, assemble a feature
# vector, fit a RandomForestClassifier, and print the importance of each
# input column as a {column_name: importance} dict.
spark = SparkSession.builder.appName("appName111").enableHiveSupport().getOrCreate()

# Single source of truth for the feature columns; order here defines the
# order of entries in model.featureImportances.
feature_cols = ['tempcabinetnacelle_1sec', 'blade3tempbattbox_1sec', 'blade1tempbattbox_1sec']

df = spark.read.csv('/root/a.csv', header=True)
df = df[feature_cols + ['label']]

# CSV columns are read as strings; cast features to float and the label to int.
for col_name in feature_cols:
    df = df.withColumn(col_name, df[col_name].cast(types.FloatType()))
df = df.withColumn('label', df['label'].cast(types.IntegerType()))

# Spark ML estimators expect all features packed into one vector column.
df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = df_assembler.transform(df)
df.show()

rf = RandomForestClassifier(numTrees=2, maxDepth=4, labelCol='label', seed=11)
# Train on the full dataset (no train/test split — importances only).
model = rf.fit(df)

# featureImportances is a vector aligned with VectorAssembler's inputCols order.
ff = model.featureImportances
print(len(ff))
importancesList = [float(v) for v in ff]
result = dict(zip(feature_cols, importancesList))
print(result)
# PySpark random-forest feature importance
# (original article last published 2021-12-29 23:07:54)