1. scale
Why scale the input data? In 《再次实现Logistic Regression(c++)_实现和测试》 the reason given was a single sentence: "due to the precision limits of the sigmoid function on a computer, we must normalize the real-valued inputs." More precisely, it is the limited range of the exponential function exp in floating-point arithmetic that makes this preprocessing necessary.
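To make the precision issue concrete, here is a minimal sketch (my own illustration, not from the original post) of how a naive sigmoid saturates on unscaled inputs: once |x| is large, exp(-x) overflows to +inf or underflows to 0, the output collapses to exactly 0 or 1, and the training gradient vanishes.

#include <cmath>
#include <cstdio>

// naive sigmoid: for doubles, exp overflows once its argument exceeds ~709
double Sigmoid (double x)
{
    return 1.0 / (1.0 + exp (-x));
}

int main ()
{
    printf ("%g\n", Sigmoid (10.0));    // ~0.999955
    printf ("%g\n", Sigmoid (1000.0));  // exactly 1: exp(-1000) underflows to 0
    printf ("%g\n", Sigmoid (-1000.0)); // exactly 0: exp(1000) overflows to +inf
    return 0;
}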
The interface for scaling is:
// scale all of the sample values and put the result into txt
bool ScaleAllSampleValTxt (const char * sFileIn, int iFeatureNum, const char * sFileOut);
The input is the raw sample file. The maximum number of features must be specified (it could in principle be discovered while reading the file, but that is less efficient, since the feature storage would have to be maintained dynamically). The scaled samples are written out to a text file. This function calls two private helpers:
// read a sample from a line, return false if fail
bool ReadSampleFrmLine (string & sLine, Sample & theSample);
// load all of the samples into sample vector, this is for scale samples
bool LoadAllSamples (const char * sFileName, vector<Sample> & SampleVec);
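The Sample and FeaValNode types are not shown in this post; judging from how the code below uses them, a minimal sketch would be:

struct FeaValNode
{
    int iFeatureId;     // feature index
    double dValue;      // feature value
};

struct Sample
{
    int iClass;                         // class label
    vector<FeaValNode> FeaValNodeVec;   // sparse list of feature:value pairs
};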
The scaling step uses a small smoothing-factor constant to avoid a zero divisor:
// a tiny constant added to each feature's max value to avoid division by zero when scaling the input samples
#define SMOOTHFATOR 1e-100
The implementation is straightforward:
// the input format is: iClassId featureid1:featurevalue1 featureid2:featurevalue2 ...
bool LogisticRegression::ReadSampleFrmLine (string & sLine, Sample & theSample)
{
    istringstream isLine (sLine);
    // the class index; fail on an empty or malformed line
    if (!(isLine >> theSample.iClass))
        return false;
    // the features and their values
    string sItem;
    while (isLine >> sItem)
    {
        string::size_type iPos = sItem.find (':');
        if (iPos == string::npos)       // skip tokens that are not "id:value"
            continue;
        FeaValNode theNode;
        theNode.iFeatureId = atoi (sItem.substr (0, iPos).c_str());
        theNode.dValue = atof (sItem.substr (iPos + 1).c_str());
        theSample.FeaValNodeVec.push_back (theNode);
    }
    return true;
}
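For example, parsing one line in the input format (ReadSampleFrmLine is private, so this is purely illustrative):

string sLine ("1 3:0.5 7:1.2");
Sample theSample;
ReadSampleFrmLine (sLine, theSample);
// theSample.iClass == 1
// theSample.FeaValNodeVec holds {3, 0.5} and {7, 1.2}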
bool LogisticRegression::LoadAllSamples (const char * sFileName, vector<Sample> & SampleVec)
{
    ifstream in (sFileName);
    if (!in)
    {
        cerr << "Can not open the file of " << sFileName << endl;
        return false;
    }
    SampleVec.clear();
    string sLine;
    while (getline (in, sLine))
    {
        Sample theSample;
        if (ReadSampleFrmLine (sLine, theSample))
            SampleVec.push_back (theSample);
    }
    return true;
}
bool LogisticRegression::ScaleAllSampleValTxt (const char * sFileIn, int iFeatureNum, const char * sFileOut)
{
    ofstream out (sFileOut);
    if (!out)
    {
        cerr << "Can not open the file of " << sFileOut << endl;
        return false;
    }
    // load all of the samples (LoadAllSamples reports failure to open sFileIn)
    vector<Sample> SampleVec;
    if (!LoadAllSamples (sFileIn, SampleVec))
        return false;
    // get the max value of each feature
    vector<double> FeaMaxValVec (iFeatureNum, 0.0);
    vector<Sample>::iterator p = SampleVec.begin();
    while (p != SampleVec.end())
    {
        vector<FeaValNode>::iterator pFea = p->FeaValNodeVec.begin();
        while (pFea != p->FeaValNodeVec.end())
        {
            if (pFea->iFeatureId < iFeatureNum
                && pFea->dValue > FeaMaxValVec[pFea->iFeatureId])
                FeaMaxValVec[pFea->iFeatureId] = pFea->dValue;
            pFea++;
        }
        p++;
    }
    // smooth FeaMaxValVec to avoid a zero divisor
    vector<double>::iterator pFeaMax = FeaMaxValVec.begin();
    while (pFeaMax != FeaMaxValVec.end())
    {
        *pFeaMax += SMOOTHFATOR;
        pFeaMax++;
    }
    // scale each sample value by its feature's max
    p = SampleVec.begin();
    while (p != SampleVec.end())
    {
        vector<FeaValNode>::iterator pFea = p->FeaValNodeVec.begin();
        while (pFea != p->FeaValNodeVec.end())
        {
            if (pFea->iFeatureId < iFeatureNum)
                pFea->dValue /= FeaMaxValVec[pFea->iFeatureId];
            pFea++;
        }
        p++;
    }
    // dump the result
    p = SampleVec.begin();
    while (p != SampleVec.end())
    {
        out << p->iClass << " ";
        vector<FeaValNode>::iterator pFea = p->FeaValNodeVec.begin();
        while (pFea != p->FeaValNodeVec.end())
        {
            out << pFea->iFeatureId << ":" << pFea->dValue << " ";
            pFea++;
        }
        out << "\n";
        p++;
    }
    return true;
}
It is invoked as follows:
ScaleAllSampleValTxt ("..\\Data\\SamplesMultClassesTrain.txt", 25334, "..\\Data\\SamplesMultClassesTrainScale.txt");
ScaleAllSampleValTxt ("..\\Data\\SamplesMultClassesTest.txt", 25334, "..\\Data\\SamplesMultClassesTestScale.txt");
Off to sleep; I'll continue coding tomorrow.
Please credit the source when reposting: http://blog.csdn.net/xceman1997/article/details/18428391