当调用 LinearRegressionWithSGD.train() 时,执行的代码如下:
/**
 * Builds a LinearRegressionWithSGD from the supplied hyper-parameters and runs it.
 *
 * @param input             RDD of labeled points handed to `run`
 * @param numIterations     passed as the second constructor argument
 * @param stepSize          passed as the first constructor argument
 * @param miniBatchFraction passed as the third constructor argument
 * @param initialWeights    weight vector handed to `run` together with `input`
 * @return the LinearRegressionModel produced by `run`
 */
def train(
    input: RDD[LabeledPoint],
    numIterations: Int,
    stepSize: Double,
    miniBatchFraction: Double,
    initialWeights: Vector): LinearRegressionModel = {
  // Note the constructor order (stepSize, numIterations, miniBatchFraction)
  // differs from this method's parameter order.
  val algorithm = new LinearRegressionWithSGD(stepSize, numIterations, miniBatchFraction)
  algorithm.run(input, initialWeights)
}
接着会调用 LinearRegressionWithSGD 从父类 GeneralizedLinearAlgorithm 继承的 run() 方法:
/**
* Run the algorithm with the configured parameters on an input RDD
* of LabeledPoint entries starting from the initial weights provided.
*/
def run(input: RDD[LabeledPoint], initialWeights: Vector): M = {
if (input.getStorageLevel == StorageLevel.NONE) {
logWarning("The input data is not directly cached, which may hurt performance if its"
+ " parent RDDs are also uncached.")
}
// Check the data properties before running the optimizer
if (validateData && !validators.forall(func => func(input))) {
throw new SparkException("Input validation failed.")
}
val scaler = if (useFeatureScaling) {
(new StandardScaler).fit(input.map(x => x.features)) //对各个特征向量做标准化归一化处理,有助于收敛
} else {
null
}
// Prepend an extra variable consisting of all 1.0's for the intercept.
val data = if (addIntercept) { //是否对特征向量增加截距,默认为false
if(useF