Spark machine learning pipeline workflow
Note:
- By default, a Spark ML model's transform/predict step writes its output to a column named "prediction"
1.Loading the data
# Load the CSV into a DataFrame: use the header row for column names,
# let Spark infer column types (fine for a small dataset), and cache it
# since it will be read repeatedly during training.
initialDF = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(fileName)
    .cache()
)
2.Preprocessing the data
- StringIndexer(categorical columns)
from pyspark.ml.feature import StringIndexer

# Encode the categorical "workingday" column as numeric label indices.
# handleInvalid="keep" gives labels unseen at fit time their own extra
# index instead of raising at transform time.
workingdayStringIndexer = StringIndexer(
    inputCol="workingday",
    outputCol="workingdayIndex",
    handleInvalid="keep",
)
- VectorAssembler
from pyspark.ml.feature import VectorAssembler

# Combine the raw numerical columns and the indexed categorical columns
# into the single "features" vector column the regressor expects.
numericalCols = ["mnth", "temp", "hr", "hum", "atemp", "windspeed"]
indexedCategoricalCols = ["yrIndex", "workingdayIndex", "weathersitIndex"]
assemblerInputs = numericalCols + indexedCategoricalCols
vectorAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
3.Train/Test Split
# 70/30 train/test split with a fixed seed for reproducibility.
# (The original one-liner put "seed=42)" after an inline comment, which
# commented out the seed argument AND the closing paren -> SyntaxError.)
trainDF, testDF = preprocessedDF.randomSplit([0.7, 0.3], seed=42)
4.Create a Machine Learning Pipeline
from pyspark.ml import Pipeline

# Chain preprocessing and the model into one estimator: the three
# StringIndexers run first, then the VectorAssembler builds the feature
# vector, and the random-forest regressor (rfr) trains last.
pipeline = Pipeline(stages=[
    workingdayStringIndexer,
    weathersitStringIndexer,
    yrStringIndexer,
    vectorAssembler,
    rfr,
])
5.Train the model
# Fit every stage of the pipeline on the training data, producing a
# fitted PipelineModel.
pipelineModel = pipeline.fit(trainDF)
6.Evaluate the model
from pyspark.ml.regression import RandomForestRegressionModel

# The fitted random-forest model sits in the last stage of the pipeline model.
rfrm = pipelineModel.stages[-1]
# Pair each assembler input with its importance score and pretty-print them.
for feature, importance in zip(assemblerInputs, rfrm.featureImportances):
    print("%-15s = %s" % (feature, importance))
print("-" * 80)
7.Making Predictions
# Apply the fitted pipeline to the held-out test set; the model stage
# appends its output in the default "prediction" column.
predictionsDF = pipelineModel.transform(testDF)
8.Evaluate
from pyspark.ml.evaluation import RegressionEvaluator

# Score the default "prediction" column against the "cnt" label;
# RMSE is the evaluator's default regression metric.
evaluator = RegressionEvaluator(labelCol="cnt")
rmse = evaluator.evaluate(predictionsDF)  # calculate RMSE of cnt vs predictions
print("Test RMSE = %f" % rmse)
9.ParamGrid
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid: 3 depths x 2 forest sizes = 6 candidate settings.
gridBuilder = ParamGridBuilder()
gridBuilder.addGrid(rfr.maxDepth, [2, 5, 10])
gridBuilder.addGrid(rfr.numTrees, [10, 50])
paramGrid = gridBuilder.build()
10.Cross-Validation
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate "prediction" against the "cnt" label (RMSE by default).
evaluator = RegressionEvaluator(labelCol="cnt", predictionCol="prediction")

# 3-fold cross-validation over the parameter grid; the whole pipeline is
# refit per fold/setting, and the seed keeps fold assignment reproducible.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    seed=27,
)
11.A New Model
# Run the cross-validated grid search: fits one model per fold per grid
# setting on the training data.
cvModel = cv.fit(trainDF)