// Load tab-separated (feedid, embedding) pairs from HDFS into a DataFrame.
// NOTE(review): reading from "/" looks like a placeholder path — confirm the real input location.
val embddding_seq_data = ss.read
  .option("sep", "\t")
  .csv("/")
  .toDF("feedid", "embedding")
// Trailing expression returns/displays the DataFrame (notebook-style).
embddding_seq_data
以 TFRecord 格式写入 HDFS:
trainData.write.mode(SaveMode.Overwrite).format("tfrecords").option("recordType", "Example").save(trainPath)
/** Winsorized min-max normalization of a single numeric column.
  *
  * Clips `colName` to its approximate [1%, 99%] quantiles (relative error
  * 0.001), then scales the clipped values linearly into [0, 1].
  *
  * @param sp      SparkSession (kept for interface compatibility; unused here)
  * @param df      input DataFrame
  * @param colName numeric column to normalize in place
  * @return DataFrame with `colName` replaced by its normalized values
  */
def min_max_norm(sp: SparkSession, df: DataFrame, colName: String): DataFrame = {
  val quantiles = df.stat.approxQuantile(colName, Array(0.01, 0.99), 0.001)
  val low = quantiles(0)
  val high = quantiles(1)
  println(s"$colName min: $low high: $high")
  // Guard against a degenerate column where both quantiles coincide:
  // dividing by (high - low) == 0 would fill the column with NaN.
  // With span forced to 1.0, every (clipped) value maps to 0.0 instead.
  val span = if (high > low) high - low else 1.0
  // Single pass: clamp to [low, high], then shift/scale into [0, 1].
  val clipped = when(col(colName) > high, high)
    .when(col(colName) < low, low)
    .otherwise(col(colName))
  df.withColumn(colName, (clipped - low) / span)
}