Credit for solving this problem goes to yihan. Many thanks!
Code from my own project:
import org.apache.spark.ml.feature.{BucketedRandomProjectionLSH, VectorAssembler}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
// Create the SparkSession
val spark: SparkSession = SparkSession.builder()
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.rdd.compress", "true")
.config("spark.sql.crossJoin.enabled", "true")
.config("spark.speculation.interval", "10000ms")
.config("spark.sql.tungsten.enabled", "true")
.appName(s"${this.getClass.getSimpleName}")
.enableHiveSupport()
.getOrCreate()
import spark.implicits._ // needed for the .toDS() call further down
// Read the data
val one_day_ago = "20220706"
val data = spark.sql(
s"""
|select * from temp_exp20220731_xxxx_${one_day_ago}
|""".stripMargin)
scala> data.printSchema
root
|-- uid: string (nullable = true)
|-- action: string (nullable = true)
|-- hour: string (nullable = true)
|-- action_index: integer (nullable = true)
|-- sum_cnt: long (nullable = true)
scala> data.show()
+----------+------+----+------------+-------+
| uid|action|hour|action_index|sum_cnt|
+----------+------+----+------------+-------+
| 25365743| 1| 18| 0| 1|
| 13163071| 1| 23| 0| 1|
| 44507994| 1| 05| 0| 1|
| 14971072| 1| 16| 0| 1|
| 15929625| 1| 06| 0| 2|
| 17996813| 1| 19| 0| 1|
| 43345973| 1| 20| 0| 1|
| 24271020| 1| 23| 0| 1|
| 67683222| 1| 11| 0| 1|
| 24941854| 1| 11| 0| 1|
| 18849244| 1| 07| 0| 1|
| 40552807| 1| 17| 0| 1|
| 79075502| 1| 21| 0| 1|
| 46305607| 1| 18| 0| 1|
| 29202402| 1| 07| 0| 1|
+----------+------+----+------------+-------+
only showing top 20 rows
data.persist()
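As an optional sanity check (my own addition, with the hypothetical name pairsPerUid), the per-uid grouping that the RDD groupByKey below produces can also be previewed with the DataFrame API:
// Preview the (action_index, sum_cnt) pairs collected per uid, which is what
// groupByKey yields before the sparse vectors are built.
val pairsPerUid = data
.filter("uid is not null and action_index is not null and sum_cnt is not null")
.groupBy("uid")
.agg(collect_list(struct(col("action_index"), col("sum_cnt"))).as("pairs"))
pairsPerUid.show(5, false)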
// Compute the sparse vector length from the largest action_index.
// Vectors.sparse requires every index to be strictly less than the vector size, hence the + 1.
val vector_len = data.select(max("action_index")).rdd.first().getInt(0) + 1
System.out.println(s"Computed vector length: ${vector_len}")
// Data processing
val dfWithFeat3 = data.repartition(400)
.select("uid","action_index","sum_cnt")
.filter("uid is not null and action_index is not null and sum_cnt is not null")
.rdd
.map(r => (r.getString(0).toDouble, (r.getInt(1), r.getLong(2).toDouble)))
.groupByKey()
.repartition(400)
.map(r => LabeledPoint(r._1, Vectors.sparse(vector_len, r._2.toSeq))) // this is the line that was throwing the error; reading the source of Vectors.sparse makes the fix clear
.toDS()
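For reference, the Vectors.sparse(size, elements) overload used above takes the vector size plus a Seq of (index, value) pairs, and every index must be strictly smaller than size. A minimal standalone sketch with toy values (my own example, not the project data):
import org.apache.spark.mllib.linalg.Vectors
// Toy example: non-zero entries at indices 2, 5 and 9 in a vector of size 10.
val pairs = Seq((2, 1.0), (5, 3.0), (9, 2.0))
val v = Vectors.sparse(10, pairs) // fine: every index is < 10
println(v) // (10,[2,5,9],[1.0,3.0,2.0])
// Vectors.sparse(9, pairs) would be rejected, because index 9 is out of range for size 9.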
IntelliJ IDEA shortcut for jumping to a function's source: Command + click on the function name.
Read the logs more, and read the source code more.
Reference:
How to aggregate a Spark data frame to get a sparse vector using Scala? - Stack Overflow
The code from that answer:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
val df = Seq((11830,1,8), (11113, 1, 3), (1081, 1,3), (2654, 1, 3), (10633, 1, 3), (11830, 1, 28), (11351, 1, 12), (2737, 1, 26), (11113, 3, 2), (6590, 1, 2)).toDF("id", "weight", "index")
val dfWithFeat = df
.rdd
.map(r => (r.getInt(0), (r.getInt(2), r.getInt(1).toDouble)))
.groupByKey()
.map(r => LabeledPoint(r._1, Vectors.sparse(1000, r._2.toSeq)))
.toDS
dfWithFeat.printSchema
dfWithFeat.show(10, false)
// Exiting paste mode, now interpreting.
root
|-- label: double (nullable = true)
|-- features: vector (nullable = true)
+-------+-----------------------+
|label |features |
+-------+-----------------------+
|11113.0|(1000,[2,3],[3.0,1.0]) |
|2737.0 |(1000,[26],[1.0]) |
|10633.0|(1000,[3],[1.0]) |
|1081.0 |(1000,[3],[1.0]) |
|6590.0 |(1000,[2],[1.0]) |
|11830.0|(1000,[8,28],[1.0,1.0])|
|2654.0 |(1000,[3],[1.0]) |
|11351.0|(1000,[12],[1.0]) |
+-------+-----------------------+
dfWithFeat: org.apache.spark.sql.Dataset[org.apache.spark.mllib.regression.LabeledPoint] = [label: double, features: vector]
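One extra note on the imports at the top: BucketedRandomProjectionLSH is a spark.ml estimator and expects ml.linalg vector columns, while the features column built above holds mllib.linalg vectors. If those sparse features are meant to go into that LSH model, they would likely need converting first; a sketch under that assumption (the name dfWithMlFeat is my own), using MLUtils.convertVectorColumnsToML:
import org.apache.spark.mllib.util.MLUtils
// Convert the mllib vector column "features" to an ml.linalg vector column
// so that spark.ml estimators such as BucketedRandomProjectionLSH can consume it.
val dfWithMlFeat = MLUtils.convertVectorColumnsToML(dfWithFeat3, "features")
dfWithMlFeat.printSchema()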