org.apache.spark.SparkException: Failed to execute user defined function($anonfun$11: (vector) => vector)

1、添加配置参数

本文使用 Spark 官网的 GBT 回归(gbdt-regression)示例,但直接运行会出现标题所示的错误。

需要在 VectorIndexer 上加上配置参数 setHandleInvalid:

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .setHandleInvalid("skip")// skip invalid values; the default setting is "error"

2、完整代码

main 方法的返回类型要显式声明为 : Unit(不能省略)

package org.example.practice

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}


/**
 * Command-line entry point for the GBT regression example.
 *
 * Loads a LIBSVM file, splits it 70/30 into training and test sets, and hands both
 * to [[GBDT1]] for training and evaluation.
 */
object GBDT {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultDataPath = "E:\\13data\\gbdt\\sample_libsvm_data.txt"

  /**
   * Runs the example.
   *
   * The explicit `: Unit` return type is required: without it the method's type would be
   * inferred from its last expression and it would no longer be a valid entry point.
   *
   * @param args optional; `args(0)` overrides the path of the LIBSVM input file
   */
  def main(args: Array[String]): Unit = {
    val dataPath = args.headOption.getOrElse(DefaultDataPath)

    // "local[4]" runs Spark in-process with 4 worker threads.
    val spark = SparkSession.builder().master("local[4]").getOrCreate()

    try {
      // Only show error-level log output.
      spark.sparkContext.setLogLevel("ERROR")

      // Load and parse the data file, converting it to a DataFrame.
      // The result has two columns: "label" and the "features" vector.
      val data = spark.read.format("libsvm").load(dataPath)

      data.show(numRows = 3, truncate = false)
      data.printSchema()

      // Split the data into training and test sets (30% held out for testing).
      val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

      trainingData.cache()
      testData.cache()
      try {
        val gbdt = new GBDT1(spark)
        gbdt.GbdtRegressionTree(trainingData, testData)
      } finally {
        // Release the cached splits even when training or evaluation fails.
        trainingData.unpersist()
        testData.unpersist()
      }
    } finally {
      // Always shut the session down so its SparkContext is released.
      spark.stop()
    }
  }

}

/**
 * Trains and evaluates a gradient-boosted-tree regressor on DataFrames that carry a
 * "label" column and a "features" vector column.
 *
 * @param spark session whose implicits are imported into the class body
 */
class GBDT1(private val spark: SparkSession)   {

  import spark.implicits._

  /**
   * Fits an indexer + GBT pipeline on `trainingData`, scores `testData`, and prints
   * five sample predictions, the test RMSE, and the learned model's debug string.
   *
   * @param trainingData rows used to fit the pipeline
   * @param testData     held-out rows used for prediction and evaluation
   */
  def GbdtRegressionTree(trainingData: DataFrame, testData: DataFrame): Unit = {
    // Chain indexer and regressor; fitting the pipeline also fits the indexer.
    val stages = Array(categoricalIndexer, boostedRegressor)
    val fitted = new Pipeline().setStages(stages).fit(trainingData)

    // Score the held-out rows and display a few of them.
    val scored = fitted.transform(testData)
    scored.select("prediction", "label", "features").show(5,truncate = false)

    val rmse = rmseOf(scored)
    println(s"Root Mean Squared Error (RMSE) on test data = $rmse")

    // Stage 1 of the fitted pipeline is the trained GBT model (stage 0 is the indexer).
    val gbtModel = fitted.stages(1).asInstanceOf[GBTRegressionModel]
    println(s"Learned regression GBT model:\n ${gbtModel.toDebugString}")
  }

  /**
   * Indexer that detects categorical features automatically: features with more than
   * 4 distinct values are treated as continuous, and invalid values are skipped.
   */
  private def categoricalIndexer: VectorIndexer = {
    val indexer = new VectorIndexer()
    indexer.setInputCol("features")
    indexer.setOutputCol("indexedFeatures")
    indexer.setMaxCategories(4)
    indexer.setHandleInvalid("skip")
    indexer
  }

  /** GBT regressor reading the indexed features, limited to 10 iterations. */
  private def boostedRegressor: GBTRegressor = {
    val regressor = new GBTRegressor()
    regressor.setLabelCol("label")
    regressor.setFeaturesCol("indexedFeatures")
    regressor.setMaxIter(10)
    regressor
  }

  /** Root-mean-squared error between the "prediction" and "label" columns of `scored`. */
  private def rmseOf(scored: DataFrame): Double = {
    val rmseEvaluator = new RegressionEvaluator()
    rmseEvaluator.setLabelCol("label")
    rmseEvaluator.setPredictionCol("prediction")
    rmseEvaluator.setMetricName("rmse")
    rmseEvaluator.evaluate(scored)
  }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值