References:
1. http://spark.apache.org/docs/latest/ml-guide.html
2. https://github.com/apache/spark/tree/v2.2.0
3. http://spark.apache.org/docs/latest/ml-classification-regression.html
Classification
Logistic regression
Binomial logistic regression
from pyspark.ml.classification import LogisticRegression

# Load training data
training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(training)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))
from pyspark.ml.classification import LogisticRegression

# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
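Note that the last line sets the threshold on the estimator lr, not on the already-fitted lrModel. A minimal sketch (an addition, not part of the original example, reusing the training DataFrame from above) of putting the chosen threshold into effect and inspecting predictions:

# Refit so the new threshold takes effect; the threshold only affects
# prediction, so the coefficients come out unchanged.
tunedModel = lr.fit(training)
tunedModel.transform(training).select("label", "prediction", "probability").show(5)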
Multinomial logistic regression
from pyspark.ml.classification import LogisticRegression

# Load training data
training = spark \
    .read \
    .format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))
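The example ends with the fitted coefficients; a minimal sketch (an addition, reusing the training DataFrame above) of scoring data with the multinomial model:

# Apply the fitted model; each row gets a probability vector over the three classes.
predictions = lrModel.transform(training)
predictions.select("label", "prediction", "probability").show(5, truncate=False)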
Decision tree classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[2]
# summary only
print(treeModel)
Random forest classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)  # summary only
Gradient-boosted tree classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only
Multilayer perceptron classifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
data = spark.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Linear Support Vector Machine
from pyspark.ml.classification import LinearSVC

# Load training data
training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(training)

# Print the coefficients and intercept for linear SVC
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))
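The example stops after printing the fitted parameters. A hedged sketch of evaluating the classifier on a held-out split (the split and evaluator here are additions, not part of the original example):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Illustrative evaluation: hold out 20% of the same data, refit, and
# measure accuracy on the held-out rows.
(train, test) = training.randomSplit([0.8, 0.2], seed=42)
lsvcModel2 = lsvc.fit(train)
predictions = lsvcModel2.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictions)))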
One-vs-Rest classifier (a.k.a. One-vs-All)
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
Naive Bayes
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
data = spark.read.format("libsvm") \
    .load("data/mllib/sample_libsvm_data.txt")

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
Regression
Linear regression
from pyspark.ml.regression import LinearRegression

# Load training data
training = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
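The summary above reports training-set metrics only; a minimal sketch (an addition, scoring the same training DataFrame since the example loads no separate test set) of producing predictions with the fitted model:

# Score data with the fitted model; in practice this would be held-out data.
predictions = lrModel.transform(training)
predictions.select("prediction", "label", "features").show(5)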
Generalized linear regression
from pyspark.ml.regression import GeneralizedLinearRegression

# Load training data
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")

glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()
Decision tree regression
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)
Random forest regression
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only
Gradient-boosted tree regression
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only
Survival regression
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors

training = spark.createDataFrame([
    (1.218, 1.0, Vectors.dense(1.560, -0.605)),
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)),
    (0.273, 1.0, Vectors.dense(0.520, 1.151)),
    (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"])
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")

model = aft.fit(training)

# Print the coefficients, intercept and scale parameter for AFT survival regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)
Isotonic regression
from pyspark.ml.regression import IsotonicRegression

# Loads data.
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")

# Trains an isotonic regression model.
model = IsotonicRegression().fit(dataset)
print("Boundaries in increasing order: %s\n" % str(model.boundaries))
print("Predictions associated with the boundaries: %s\n" % str(model.predictions))

# Makes predictions.
model.transform(dataset).show()
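As a hedged follow-up (not in the original example), the quality of the isotonic fit can be quantified with RegressionEvaluator, assuming the default label and prediction column names:

from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate on the same dataset the model was fit on, for illustration only.
predictions = model.transform(dataset)
evaluator = RegressionEvaluator(metricName="rmse")
print("RMSE on training data = %g" % evaluator.evaluate(predictions))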