Hypothesis testing
样例
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession
object HypothesisTestingExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("HypothesisTestingExample").getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val data = Seq(
(0.0, Vectors.dense(0.5, 10.0)),
(0.0, Vectors.dense(1.5, 20.0)),
(1.0, Vectors.dense(1.5, 30.0)),
(0.0, Vectors.dense(3.5, 30.0)),
(0.0, Vectors.dense(3.5, 40.0)),
(1.0, Vectors.dense(3.5, 40.0))
)
import spark.implicits._
val df = data.toDF("label", "features")
val chi = ChiSquareTest.test(df, "features", "label").head
println("pValues = " + chi.getAs[Vector](0))
println("degreesOfFreedom = " + chi.getSeq[Int](1).mkString("[", ",", "]"))
println("statistics = " + chi.getAs[Vector](2))
}
}
结果
pValues = [0.6872892787909721,0.6822703303362126]
degreesOfFreedom = [2,3]
statistics = [0.75,1.5]