import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, BinaryClassificationEvaluator}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
// Create the SparkSession.
// NOTE: all string literals below use plain ASCII quotes — the original file
// used typographic quotes (“ ”), which do not compile in Scala source.
val spark = SparkSession.builder()
  .appName("BilibiliAnalysis")
  .config("spark.master", "local")
  .getOrCreate()

// Read the CSV file, treating the first row as a header.
val filePath = "file:///usr/local/hadoop/bilibili.csv"
val df = spark.read.option("header", "true").csv(filePath)

// Cast every column of interest to int and replace nulls with 0.
val convertedDF = df.select(
  col("Views").cast("int"),
  col("Danmaku_Count").cast("int"),
  col("Comment_Count").cast("int"),
  col("Favorite_Count").cast("int"),
  col("Coin_Count").cast("int"),
  col("Share_Count").cast("int"),
  col("Like_Count").cast("int"),
  col("Partition_Ranking").cast("int")
).na.fill(0)

// Binary label: 1.0 when the video ranks in the partition top-10, else 0.0.
// Double literals are used because Spark ML classifiers expect a double label.
val labeledDF = convertedDF.withColumn(
  "label",
  when(col("Partition_Ranking") <= 10, 1.0).otherwise(0.0)
)

// Assemble the feature columns into a single "features" vector column.
val featureCols = Array("Views", "Danmaku_Count", "Comment_Count",
  "Favorite_Count", "Coin_Count", "Share_Count", "Like_Count")
val assembler = new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol("features")
val assembledDF = assembler.transform(labeledDF)

// 80/20 train/test split; fixed seed for reproducibility.
val Array(trainData, testData) = assembledDF.randomSplit(Array(0.8, 0.2), seed = 1234)

// Decision-tree classifier.
val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")

// Hyper-parameter grid: Array[ParamMap] produced once by .build().
val paramGrid = new ParamGridBuilder()
  .addGrid(dt.maxDepth, Array(5, 10, 15))
  .addGrid(dt.maxBins, Array(16, 32, 64))
  .build()

// Accuracy evaluator driving model selection during cross-validation.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

// 5-fold cross-validation over the grid.
// (The original passed paramGrid.build() here, but paramGrid is already the
// built Array[ParamMap] — calling .build() on it again does not compile.)
val cv = new CrossValidator()
  .setEstimator(dt)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(5)

// Fit cross-validation and keep the best model it found.
// (The original declared bestModel / predictions / accuracy several times
// with identical names — illegal val redefinition outside the REPL; each is
// now declared exactly once.)
val cvModel = cv.fit(trainData)
val bestModel = cvModel.bestModel.asInstanceOf[DecisionTreeClassificationModel]

// Predict on the held-out test set.
val predictions = bestModel.transform(testData)

// Multiclass accuracy on the test set.
val accuracy = evaluator.evaluate(predictions)
println("Test Accuracy: " + accuracy)

// AUC: the binary evaluator must read the raw score column "rawPrediction",
// not the hard 0/1 "prediction" column — otherwise the ROC curve degenerates
// to a single threshold and the AUC is meaningless.
val binaryEvaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
val auc = binaryEvaluator.evaluate(predictions)
println("Binary Classification AUC: " + auc)

// Report the winning hyper-parameters. (The original referenced undefined
// bestMaxDepth / bestMaxBins; read them from the fitted model instead.)
println(s"Best model params: maxDepth = ${bestModel.getMaxDepth}, maxBins = ${bestModel.getMaxBins}")

// Release Spark resources.
spark.stop()