版本:pyspark = 3.2.0
本文使用工业蒸汽数据集,该数据集共包括39个变量:38个特征变量(V0到V37)和1个目标变量target。
本项目从V0到V37中选择某些变量对target进行预测(回归)。
导包:
from pyspark.sql import SparkSession
from pyspark.ml.feature import ChiSqSelector,VectorAssembler
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
具体实现代码:
if __name__ == '__main__':
    # End-to-end regression script: load the tab-separated training file,
    # cast every column to float, pick 20 features, scale them, and tune a
    # random forest regressor with cross-validation.
    spark = (
        SparkSession.builder
        .appName('test')
        .master('local[*]')
        .getOrCreate()
    )
    try:
        data = (
            spark.read.format('csv')
            .option('header', True)
            .option('sep', '\t')
            .option('encoding', 'utf-8')
            .load('./data/zhengqi_train.txt')
        )

        # Dataset dimensions.
        print('data row count:' + str(data.count()))
        print('data column count:' + str(len(data.columns)))

        # The CSV reader yields strings; cast every column to float in a
        # single projection instead of one withColumn call per column, which
        # would grow the query plan by one node per column.
        data = data.select([data[c].cast('float').alias(c) for c in data.columns])

        # Assemble the predictors only. The label column must NOT be part of
        # the feature vector, otherwise the model is handed the value it is
        # supposed to predict (label leakage).
        feature_cols = [c for c in data.columns if c != 'target']
        assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
        assembled = (
            assembler.transform(data)
            .select('features', 'target')
            .withColumnRenamed('target', 'label')
        )

        # Split BEFORE fitting any data-dependent transformer so that the
        # held-out rows never influence feature selection or scaling.
        train_raw, test_raw = assembled.randomSplit([0.7, 0.3], seed=100)

        # Feature selection: keep the top 20 features by chi-squared test.
        # NOTE(review): the chi-squared test treats the label as categorical,
        # while 'target' here is continuous; a selector designed for
        # continuous labels (e.g. UnivariateFeatureSelector) may be a better
        # fit — confirm before relying on the chosen features.
        selector = ChiSqSelector(
            featuresCol='features',
            labelCol='label',
            numTopFeatures=20,
            outputCol='newfeatures',
        )
        selector_model = selector.fit(train_raw)
        train_selected = selector_model.transform(train_raw)
        test_selected = selector_model.transform(test_raw)

        # Feature scaling, fitted on the training split only.
        scaler = MaxAbsScaler(inputCol='newfeatures', outputCol='scalerfeatures')
        scaler_model = scaler.fit(train_selected)
        data_rf_train = scaler_model.transform(train_selected).select('scalerfeatures', 'label')
        data_rf_test = scaler_model.transform(test_selected).select('scalerfeatures', 'label')

        # Model: random forest regressor (seeded so runs are repeatable).
        rf = RandomForestRegressor(
            numTrees=50,
            maxDepth=4,
            maxBins=32,
            featuresCol='scalerfeatures',
            labelCol='label',
            seed=100,
        )

        # The reported metric is RMSE (lower is better), not an accuracy.
        evaluator = RegressionEvaluator(
            predictionCol='prediction',
            labelCol='label',
            metricName='rmse',
        )

        # Hyper-parameter search over tree count, bin count and depth.
        param_grid = (
            ParamGridBuilder()
            .addGrid(rf.numTrees, [50, 100, 150])
            .addGrid(rf.maxBins, [32, 64])
            .addGrid(rf.maxDepth, [4, 5, 6])
            .build()
        )
        cv = CrossValidator(
            estimator=rf,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            seed=100,
        )
        cvmodel = cv.fit(data_rf_train)

        # Score the best model found by cross-validation on the held-out split.
        result = cvmodel.transform(data_rf_test)
        print(evaluator.evaluate(result))
    finally:
        # Always release the local Spark resources, even if a stage fails.
        spark.stop()