对于 Spark SQL 无法直接处理的类型(例如 ML 的 Vector 列),可以通过 spark.udf.register 注册自定义函数(UDF)来处理。
// Register one UDF per vector component so that SQL can read individual
// elements of the `pcaFeatures` column.
// NOTE(review): `Vector` is presumably Spark's linalg Vector (not the Scala
// collection) — confirm against the file's imports.
spark.udf.register("getPCA0", (features: Vector) => features.apply(0))
spark.udf.register("getPCA1", (features: Vector) => features.apply(1))

// Project the first two components alongside the prediction column of the
// `predictions` view.
val sql = spark.sql(
  "select getPCA0(pcaFeatures) as pca0,getPCA1(pcaFeatures) as pca1,prediction from predictions"
)

// Print the first 10 rows without truncating long cell values.
sql.show(10, truncate = false)
// Load the persisted pipeline from `modelPath`, apply it to `df`, and expose
// the transformed rows to SQL under the view name `predictions`.
val model = PipelineModel.load(modelPath)
val scored = model.transform(df)
scored.createOrReplaceTempView("predictions")

// UDF that returns element 1 of a vector column.
// NOTE(review): element 1 is presumably the positive-class (DGA) probability —
// confirm against the model's label indexing.
spark.udf.register("getDga", (probabilities: Vector) => probabilities.apply(1))

// Keep only rows whose prediction equals 1.0 and cache the result for reuse.
// NOTE(review): `sql` is also bound by the PCA snippet earlier in this file;
// this assumes the two snippets run in separate scopes (e.g. a REPL) — confirm.
val sql = spark
  .sql("select domain_key as row_key,domain, getDga(probability) as probability from predictions where prediction == 1.0")
  .cache()