2. SGD训练
SGD权重更新方式,同LR二分类的基本相同;所不同的是,二分类LR只用训练一个权重向量,而K分类LR需要训练K-1个权重向量。函数接口如下:
// Train the model by SGD on the sample file.
//   sFileName        - path of the sample file (one sample per line)
//   iClassNum        - number of classes; used to initialize the theta matrix
//   iFeatureNum      - number of features; used to initialize the theta matrix
//   dLearningRate    - step size applied to every per-sample gradient update
//   iMaxLoop         - maximum number of passes over the sample file
//   dMinImproveRatio - training stops early once the relative improvement of the
//                      average cost between two passes falls below this value
// Returns false on failure (e.g. the file cannot be opened or the theta matrix
// cannot be initialized).
bool TrainSGDOnSampleFile (
const char * sFileName, int iClassNum, int iFeatureNum, // about the samples
double dLearningRate, // about the learning
int iMaxLoop, double dMinImproveRatio // about the stop criteria
);
调用private函数如下:
// Initialize the theta (weight) matrix from iClassNum and iFeatureNum.
// Returns false on failure; training aborts in that case.
bool InitThetaMatrix (int iClassNum, int iFeatureNum);
// Calculate the un-normalized model output for iClassIndex from a feature vector:
// exp(W*X) for a non-default class, 1.0 for the default class (the one with the
// largest index) and 0.0 when iClassIndex is out of range.
double CalcFuncOutByFeaVec (vector<FeaValNode> & FeaValNodeVec, int iClassIndex);
// Calculate the normalized model output for all the classes (written to ClassProbVec,
// one entry per class) and return the class index with the max value, or -1 if no
// entry is greater than zero.
int CalcFuncOutByFeaVecForAllClass (vector<FeaValNode> & FeaValNodeVec, vector<double> & ClassProbVec);
// Calculate the gradient for one sample from the given class outputs and update the
// theta matrix in place with step size dLearningRate; it returns the cost of the sample.
double UpdateThetaMatrix (Sample & theSample, vector<double> & ClassProbVec, double dLearningRate);
函数功能分别是:初始化权重矩阵;利用现有LR模型参数计算指定类别的未归一化输出(指数函数值 exp(W·X),default类别固定返回1.0);计算所有类别的预测概率并返回概率最大的类别下标;根据单个样本更新权重并返回该样本的代价。
SGD的函数实现代码如下:
// Train the model with stochastic gradient descent over a sample file.
//
// The sample format is: classid feature1_value feature2_value...
// Each line is parsed by ReadSampleFrmLine; lines it rejects are skipped.
//
// Parameters:
//   sFileName        - path of the sample file
//   iClassNum        - number of classes, forwarded to InitThetaMatrix
//   iFeatureNum      - number of features, forwarded to InitThetaMatrix
//   dLearningRate    - step size of every per-sample weight update
//   iMaxLoop         - maximum number of passes over the file
//   dMinImproveRatio - stop as soon as (previous cost - cost) / previous cost
//                      drops below this value
//
// Returns false when the file name is null, the file cannot be opened, the theta
// matrix cannot be initialized, or a pass over the file yields no valid sample;
// true otherwise.
bool LogisticRegression::TrainSGDOnSampleFile (
    const char * sFileName, int iClassNum, int iFeatureNum, // about the samples
    double dLearningRate = 0.05, // about the learning
    int iMaxLoop = 1, double dMinImproveRatio = 0.01 // about the stop criteria
    )
{
    // a null file name must be rejected before it reaches ifstream or operator<<
    if (!sFileName)
    {
        cerr << "The sample file name is null" << endl;
        return false;
    }

    ifstream in (sFileName);
    if (!in)
    {
        cerr << "Can not open the file of " << sFileName << endl;
        return false;
    }

    if (!InitThetaMatrix (iClassNum, iFeatureNum))
        return false;

    // NOTE(review): 100.0 is an arbitrary "previous cost" for the first pass; if the
    // first average cost is not clearly below it, training stops after one pass.
    double dPreCost = 100.0;
    for (int iLoop = 0; iLoop < iMaxLoop; iLoop++)
    {
        double dCost = 0.0;
        int iSampleNum = 0;
        int iErrNum = 0;

        string sLine;
        while (getline (in, sLine))
        {
            Sample theSample;
            if (!ReadSampleFrmLine (sLine, theSample))
                continue;

            // predict with the current weights first, then learn from the sample
            vector<double> ClassProbVec;
            int iPredClassIndex = CalcFuncOutByFeaVecForAllClass (theSample.FeaValNodeVec, ClassProbVec);
            if (iPredClassIndex != theSample.iClass)
                iErrNum++;
            dCost += UpdateThetaMatrix (theSample, ClassProbVec, dLearningRate);
            iSampleNum++;
        }

        // Without this guard the averages below would be 0/0 = NaN; the NaN never
        // satisfies the stop criterion, so the loop would keep re-reading a useless
        // file and finally report success.
        if (iSampleNum == 0)
        {
            cerr << "No valid sample found in the file of " << sFileName << endl;
            return false;
        }

        dCost /= iSampleNum;
        double dTmpRatio = (dPreCost - dCost) / dPreCost;
        double dTmpErrRate = (double)iErrNum / iSampleNum;

        // show info on screen
        cout << "In loop " << iLoop << ": current cost (" << dCost << ") previous cost (" << dPreCost << ") ratio (" << dTmpRatio << ") "<< endl;
        cout << "And Error rate : " << dTmpErrRate << endl;

        if (dTmpRatio < dMinImproveRatio)
            break;

        dPreCost = dCost;
        // reset the current reading position of file; clear() must come first
        // because the stream is in the eof/fail state after the last getline
        in.clear();
        in.seekg (0, ios::beg);
    }

    return true;
}
其中计算各个类别概率方式如下:
// Calculate the un-normalized model output of one class for a feature vector.
//
// It returns f(x) = exp (W*X) for iClassIndex < K-1 (0-based), 1.0 for the default
// class iClassIndex == K-1 (the class with the max index), and 0.0 when iClassIndex
// is out of range. The value is NOT a probability; the caller has to normalize it.
//
// Features whose id falls outside the weight row (negative or too large) are
// ignored instead of indexing out of bounds.
double LogisticRegression::CalcFuncOutByFeaVec(vector<FeaValNode> & FeaValNodeVec, int iClassIndex)
{
    if (iClassIndex >= iClassNum || iClassIndex < 0) // wrong situation
        return 0.0;
    if (iClassIndex == (iClassNum-1) ) // the default class (here is the class with max index)
        return 1.0;

    double dX = 0.0;
    for (vector<FeaValNode>::iterator p = FeaValNodeVec.begin(); p != FeaValNodeVec.end(); ++p)
    {
        // all input is evil: check both ends of the range, a negative id would
        // otherwise pass the upper-bound test and read before the row
        if (p->iFeatureId >= 0 && p->iFeatureId < static_cast<int>(ThetaMatrix.at(iClassIndex).size()))
            dX += ThetaMatrix[iClassIndex][p->iFeatureId] * p->dValue;
    }

    return exp (dX);
}
注意两点:1. 在K个类别中,第K个类别是default类别;2. 此时返回的不是sigmoid函数值,而是指数函数值。最终的概率在如下代码中计算:
// Calculate the probability of every class for a feature vector and return the
// index of the most probable class (-1 if there is no class at all).
//
// the class probability is calculated by :
// f(x) = exp (W*X) / {1.0 + sum_exp (W*X)} as long as iClassIndex < K
// f(x) = 1.0 / {1.0 + sum_exp (W*X)} as long as iClassIndex == K
//
// ClassProbVec is overwritten and holds iClassNum entries on return.
int LogisticRegression::CalcFuncOutByFeaVecForAllClass (vector<FeaValNode> & FeaValNodeVec, vector<double> & ClassProbVec)
{
    ClassProbVec.clear();
    if (iClassNum <= 0) // nothing to predict; also keeps resize() away from a negative count
        return -1;
    ClassProbVec.resize (iClassNum, 0.0);

    // The "1.0" of the denominator is the output of the default class, which
    // CalcFuncOutByFeaVec already returns for the last index. The sum therefore
    // has to start at 0.0; starting at 1.0 would count the default class twice
    // and the probabilities would no longer add up to 1.
    double dSum = 0.0;
    for (int i = 0; i < iClassNum; i++)
    {
        ClassProbVec.at(i) = CalcFuncOutByFeaVec (FeaValNodeVec, i);
        dSum += ClassProbVec.at(i);
    }

    // normalize and pick the arg max in the same pass
    double dMaxProb = 0.0;
    int iClassMaxProb = -1;
    for (int i = 0; i < iClassNum; i++)
    {
        ClassProbVec.at(i) /= dSum;
        if (ClassProbVec.at(i) > dMaxProb)
        {
            dMaxProb = ClassProbVec.at(i);
            iClassMaxProb = i;
        }
    }

    return iClassMaxProb;
}
计算出的概率实际上是softmax概率。权重更新函数:
// Apply one SGD step for a single sample and return the cost of that sample.
//
// the update formula is : theta_new = theta_old - dLearningRate * (dY - dTarget) * dXi
// where dY is the given output of class i and dTarget is 1.0 when i is the class
// of the sample, otherwise 0.0. Only the K-1 non-default classes own a weight row,
// so the default class (max index) is never updated.
//
// The returned cost is the sum over the non-default classes of
//   -log (dY)        when i is the class of the sample
//   -log (1.0 - dY)  otherwise
//
// Features whose id falls outside the weight row (negative or too large) are
// skipped. If ClassProbVec has fewer than K-1 entries nothing is updated and 0.0
// is returned.
double LogisticRegression::UpdateThetaMatrix (Sample & theSample, vector<double> & ClassProbVec, double dLearningRate)
{
    // all input is evil: the loop below reads ClassProbVec[0 .. iClassNum-2]
    if (static_cast<int>(ClassProbVec.size()) < iClassNum - 1)
    {
        cerr << "The class probability vector is too short for the model" << endl;
        return 0.0;
    }

    double dCost = 0.0;
    for (int i = 0; i < iClassNum-1; i++)
    {
        const bool bIsSampleClass = (i == theSample.iClass);
        // (dY - dTarget): the only thing that differs between the sample class
        // and the other classes
        const double dError = bIsSampleClass ? (ClassProbVec[i] - 1.0) : ClassProbVec[i];
        const int iRowSize = static_cast<int>(ThetaMatrix[i].size());

        for (vector<FeaValNode>::iterator p = theSample.FeaValNodeVec.begin(); p != theSample.FeaValNodeVec.end(); ++p)
        {
            // a negative id would pass a bare upper-bound test and write before the row
            if (p->iFeatureId < 0 || p->iFeatureId >= iRowSize)
                continue;

            double dGradient = dError * p->dValue;
            double dDelta = dGradient * dLearningRate;
            ThetaMatrix[i][p->iFeatureId] -= dDelta;
        }

        // cost = log(dY) when i is the class of the sample, otherwise cost = log(1.0 - dY)
        if (bIsSampleClass)
            dCost -= log (ClassProbVec[i]);
        else
            dCost -= log (1.0 - ClassProbVec[i]);
    }

    return dCost;
}
完。
转载请注明出处:http://blog.csdn.net/xceman1997/article/details/18449317