import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

/**
* Loads labeled data in the LIBSVM format into an RDD[LabeledPoint].
* The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
* Each line represents a labeled sparse feature vector using the following format:
* {{{label index1:value1 index2:value2 ...}}}
* where the indices are one-based and in ascending order.
* This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]],
* where the feature indices are converted to zero-based.
*
* @param sc Spark context
* @param path file or directory path in any Hadoop-supported file system URI
 * @param numFeatures number of features, which will be determined from the input data if a
 *                    nonpositive value is given. This is useful when the dataset is already
 *                    split into multiple files and you want to load them separately, because
 *                    some features may not be present in certain files, which leads to
 *                    inconsistent feature dimensions.
* @param minPartitions min number of partitions
* @return labeled data stored as an RDD[LabeledPoint]
*/
def loadLibSVMFile(
    sc: SparkContext,
    path: String,
    numFeatures: Int,
    minPartitions: Int): RDD[LabeledPoint] = {
  val parsed = sc.textFile(path, minPartitions)
    // Trim surrounding whitespace, then drop blank lines and comment lines.
    .map(_.trim)
    .filter(line => !(line.isEmpty || line.startsWith("#")))
    .map { line =>
      val items = line.split(' ')
      // The first token is the label; the remaining tokens are index:value pairs.
      val label = items.head.toDouble
      val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
        val indexAndValue = item.split(':')
        val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
        val value = indexAndValue(1).toDouble
        (index, value)
      }.unzip
      (label, indices.toArray, values.toArray)
    }
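  // For example (hypothetical input), the line "1 1:0.5 3:1.2" parses to
  // (1.0, Array(0, 2), Array(0.5, 1.2)) after the 1-based -> 0-based shift.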
  // Determine the number of features.
  val d = if (numFeatures > 0) {
    numFeatures
  } else {
    // The parsed RDD is traversed once here and once below, so cache it.
    parsed.persist(StorageLevel.MEMORY_ONLY)
    // Indices are required to be ascending, so the last index on each line is
    // that line's maximum; take the global max and add 1 for the 0-based shift.
    parsed.map { case (label, indices, values) =>
      indices.lastOption.getOrElse(0)
    }.reduce(math.max) + 1
  }
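  // E.g., if the largest 0-based index across all lines is 2, then d = 3.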
  parsed.map { case (label, indices, values) =>
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
The above is the source of loadLibSVMFile from org.apache.spark.mllib.util.MLUtils.
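As a quick usage sketch (the file path and its contents below are made up for illustration), the two-argument overload MLUtils.loadLibSVMFile(sc, path) passes a nonpositive numFeatures, so the feature dimension is inferred from the data as shown above:

// Suppose /tmp/sample_libsvm.txt (hypothetical) contains the two lines:
//   1 1:0.5 3:1.2
//   0 2:0.8
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object LoadLibSVMExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("loadLibSVMFile-demo").setMaster("local[*]"))
    // numFeatures is not given, so the dimension is inferred from the data:
    // the largest 0-based index is 2, hence a dimension of 3.
    val data = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm.txt")
    data.collect().foreach(println)
    // Expected output (order may vary across partitions):
    //   (1.0,(3,[0,2],[0.5,1.2]))
    //   (0.0,(3,[1],[0.8]))
    sc.stop()
  }
}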