两个步骤:
1.将特征转换为LabeledPoint数据X,或者采用VectorAssembler将特征合并为单个向量列(稀疏/稠密向量)
2.然后以libsvm格式保存X.
// Approach 1: use VectorAssembler to merge the numeric feature columns
// into a single "features" vector column, then persist as libsvm.
val assembler = new VectorAssembler().setInputCols(numberfeature).setOutputCol("features")
val assemblePipeline = new Pipeline().setStages(Array(assembler))
val assembled = assemblePipeline.fit(data).transform(data).select("features","qimei","label")
// Cast the label column to double (required by the libsvm writer):
// add a temp column, drop the original, then rename the temp back to "label".
val relabeled = assembled
  .withColumn("labelTemp", assembled("label") * 1.0)
  .drop("label")
  .withColumnRenamed("labelTemp", "label")
// Random 80/20 train/test split; fixed seed keeps the split reproducible.
val Array(training, test) = relabeled.randomSplit(Array(0.8, 0.2), seed = 2019)
println("train count:" + training.count())
println("test count:" + test.count())
val train_path = s"hdfs://ss-sng-dc-v2/stage/outface/SNG/g_sng_weishi_ws_growth/julianxu/data/boxmoney/${ftime}/train"
val test_path = s"hdfs://ss-sng-dc-v2/stage/outface/SNG/g_sng_weishi_ws_growth/julianxu/data/boxmoney/${ftime}/test"
// Remove any previous output on HDFS so save() does not fail on existing paths.
SparkUtil.deleteModel(ss, train_path)
SparkUtil.deleteModel(ss, test_path)
// libsvm format expects exactly a numeric "label" and a vector "features" column.
training.select("label", "features").write.format("libsvm").save(train_path)
test.select("label", "features").write.format("libsvm").save(test_path)
// Approach 2: build LabeledPoint rows directly and save as libsvm.
// FIX: package names are case-sensitive and all lowercase — "org.Apache.spark"
// does not compile; it must be "org.apache.spark".
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.LabeledPoint
// A positive example with a dense feature vector.
val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
// A negative example with a sparse vector: size 3, non-zeros at indices 0 and 2.
val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
// NOTE(review): toDF requires spark.implicits._ in scope — confirm it is imported earlier in the file.
val df = Seq(neg, pos).toDF("label","features")
df.write.format("libsvm").save("data/foo")
/**
 * Recursively delete a path on the filesystem backing `fpath` (typically HDFS),
 * if it exists. Used to clear previously saved model/data directories so that
 * a subsequent `save()` does not fail on an existing path.
 *
 * Example: fpath = "hdfs://ss-sng-dc-v2/stage/outface/SNG/g_sng_weishi_ws_growth/julianxu/data/KMeansModel"
 *
 * @param ss    active SparkSession, used to obtain the Hadoop configuration
 * @param fpath full URI of the file or directory to remove
 */
def deleteModel(ss: SparkSession, fpath: String): Unit = {
  val hadoopConf = ss.sparkContext.hadoopConfiguration
  // Build the Path once instead of three separate times.
  val target = new Path(fpath)
  val fs = target.getFileSystem(hadoopConf)
  if (fs.exists(target)) {
    // recursive = true: deletes non-empty directories as well
    fs.delete(target, true)
    println("del:" + fpath)
  }
}
参考:
https://www.it-swarm.net/zh/apache-spark/%E5%A6%82%E4%BD%95%E4%BB%8Edataframe%E5%87%86%E5%A4%87%E6%95%B0%E6%8D%AE%E5%88%B0libsvm%E6%A0%BC%E5%BC%8F%EF%BC%9F/828617867/