优化算法比较
下面第一篇是某位博主写的关于优化算法解法的一些总结；第二篇是一位国外博主的总结，貌似已经整理成 survey 发表了，国内也有人翻译成中文，可以结合着看，先保存起来。目前我自己刚看到 momentum 部分，实现也只到这里——用各种优化算法来解 logistic regression，结果回头再贴出来，先做个标记。
http://blog.csdn.net/luo123n/article/details/48239963
http://sebastianruder.com/optimizing-gradient-descent/index.html#fn:7
http://mp.weixin.qq.com/s?__biz=MzA3MzI4MjgzMw==&mid=2650720663&idx=3&sn=d9f671f77be23a148d1830448154a545&chksm=871b0de9b06c84ffaf260b9ba2a010108cca62d5ce3dcbd8c98c72c9f786f9cd460b27b496ca&mpshare=1&scene=2&srcid=1121mgll9exVL2Gia7trGTn7&from=timeline#wechat_redirect
https://www.52ml.net/21094.html
/**
 * Min-max normalization: rescales every feature of every instance into
 * [0, 1] via (x - min) / (max - min), with the extrema computed per
 * feature over the whole corpus.
 *
 * Bug fix: the bounds were previously initialized to 0.0, so a feature
 * whose values are all positive never updated its min (and an all-negative
 * feature never updated its max), producing a wrong scale. Bounds now
 * start at +/- infinity, and the two comparisons are independent so a
 * single value can establish both bounds.
 */
private void normalization(){
    int featureNum = corpus.getFeatureNum();
    double[] max = new double[featureNum];
    double[] min = new double[featureNum];
    for (int i = 0; i < featureNum; i++) {
        max[i] = Double.NEGATIVE_INFINITY;
        min[i] = Double.POSITIVE_INFINITY;
    }
    // Pass 1: find the per-feature extrema.
    for (LRInstance instance : corpus.getInstances().values()) {
        for (int i = 0; i < featureNum; i++) {
            double feature = instance.getFeatureIndex(i);
            if (feature > max[i]) {
                max[i] = feature;
            }
            if (feature < min[i]) { // independent check, not else-if
                min[i] = feature;
            }
        }
    }
    // Pass 2: rescale in place; constant features (max == min) are left
    // untouched to avoid division by zero.
    for (LRInstance instance : corpus.getInstances().values()) {
        for (int i = 0; i < featureNum; i++) {
            if (max[i] == min[i]) continue;
            double feature = instance.getFeatureIndex(i);
            instance.setFeatureIndex(i, (feature - min[i]) / (max[i] - min[i]));
        }
    }
}
/**
 * Logistic (sigmoid) function: maps any real z into the open interval
 * (0, 1). The original (misspelled) name is kept because the gradient
 * descent methods in this class call it.
 */
private double sigmod(double z){
    double expNegZ = Math.exp(-z);
    return 1.0 / (1.0 + expNegZ);
}
/**
 * Batch gradient descent: every iteration uses the full corpus.
 *
 * Bug fix: the weight update previously ran inside the per-feature loop,
 * so the gradient for feature i was computed against weights already
 * modified for features 0..i-1 within the same iteration. True batch GD
 * evaluates the whole gradient at a fixed weight vector and then applies
 * all updates at once; the gradient is now accumulated into a vector
 * before any weight changes. Predictions are also computed once per
 * instance instead of once per (instance, feature) pair.
 */
private void batchGradientDescent(){
    int featureNum = corpus.getFeatureNum();
    double[] gradient = new double[featureNum];
    for (int k = 0; k < conf.getMaxIter(); k++) {
        for (int i = 0; i < featureNum; i++) {
            gradient[i] = 0.0;
        }
        // Accumulate the full-batch gradient with the current weights fixed.
        for (LRInstance instance : corpus.getInstances().values()) {
            double predict = sigmod(instance.getCurrRTW(weights));
            double residual = instance.getLabel() - predict;
            for (int i = 0; i < featureNum; i++) {
                gradient[i] += residual * instance.getFeatureIndex(i);
            }
        }
        // Simultaneous update of every weight (ascent on the log-likelihood).
        for (int i = 0; i < featureNum; i++) {
            weights[i] += conf.getShrinkage() * gradient[i];
        }
        error(k); // track training error for this iteration
    }
}
/**
 * Stochastic gradient descent: every iteration updates on one randomly
 * chosen instance.
 *
 * Fixes:
 * - Random is created once, outside the loop (re-seeding every iteration
 *   is wasteful and, with a time-based seed, can repeat draws);
 * - the redundant "% n" after nextInt(n) was dropped — nextInt(n) already
 *   yields [0, n-1]; the +1 is kept since instance ids appear to be
 *   1-based (TODO confirm against corpus construction);
 * - the prediction is computed once per sampled instance with the weight
 *   vector held fixed, so every feature's gradient uses the same
 *   prediction (previously weights changed mid-loop, skewing the gradient
 *   of later features).
 */
private void stoGradientDescent(){
    Random random = new Random();
    int featureNum = corpus.getFeatureNum();
    for (int k = 0; k < conf.getMaxIter(); k++) {
        int instanceId = random.nextInt(corpus.getInstancesNum()) + 1; // ids assumed 1-based
        LRInstance instance = corpus.getInstances().get(instanceId);
        double predict = sigmod(instance.getCurrRTW(weights));
        double residual = instance.getLabel() - predict;
        for (int i = 0; i < featureNum; i++) {
            weights[i] += conf.getShrinkage() * residual * instance.getFeatureIndex(i);
        }
        error(k);
    }
}
/**
 * Mini-batch gradient descent: every iteration samples sampleNum
 * instances (with replacement) and updates on their summed gradient.
 *
 * Fixes:
 * - the gradient is accumulated into a vector with the weights held
 *   fixed, then all weights are updated simultaneously (previously
 *   feature i's gradient already saw the updates applied to 0..i-1);
 * - the prediction is computed once per sampled instance instead of once
 *   per (instance, feature) pair;
 * - the redundant "% n" after nextInt(n) was removed; the +1 is kept
 *   since instance ids appear to be 1-based (TODO confirm);
 * - the sample list is presized to sampleNum.
 *
 * @param sampleNum number of instances drawn per iteration
 */
private void miniBatchGradientDescent(int sampleNum){
    Random random = new Random();
    int featureNum = corpus.getFeatureNum();
    double[] gradient = new double[featureNum];
    for (int k = 0; k < conf.getMaxIter(); k++) {
        // Step 1: draw the mini-batch (sampling with replacement).
        List<LRInstance> samples = new ArrayList<LRInstance>(sampleNum);
        for (int n = 0; n < sampleNum; n++) {
            int instanceId = random.nextInt(corpus.getInstancesNum()) + 1; // ids assumed 1-based
            samples.add(corpus.getInstances().get(instanceId));
        }
        // Step 2: accumulate the mini-batch gradient at the current weights.
        for (int i = 0; i < featureNum; i++) {
            gradient[i] = 0.0;
        }
        for (LRInstance instance : samples) {
            double predict = sigmod(instance.getCurrRTW(weights));
            double residual = instance.getLabel() - predict;
            for (int i = 0; i < featureNum; i++) {
                gradient[i] += residual * instance.getFeatureIndex(i);
            }
        }
        // Step 3: simultaneous weight update.
        for (int i = 0; i < featureNum; i++) {
            weights[i] += conf.getShrinkage() * gradient[i];
        }
        error(k);
    }
}
/**
 * SGD with momentum: v = gamma * v + eta * gradient; w += v. The velocity
 * vector smooths successive stochastic gradients, accelerating progress
 * along consistently signed directions and damping oscillation.
 *
 * Fixes:
 * - removed the leftover debug println of gamma;
 * - Random is reused across iterations instead of implicitly re-created;
 * - the redundant "% n" after nextInt(n) was removed; the +1 is kept
 *   since instance ids appear to be 1-based (TODO confirm);
 * - the prediction is computed once per sampled instance with the weights
 *   held fixed, so every component of the gradient uses the same
 *   prediction (previously weights changed mid-loop);
 * - the explicit zero-fill of the velocity array was dropped — Java
 *   zero-initializes new arrays.
 */
private void momentum (){
    Random random = new Random();
    int featureNum = corpus.getFeatureNum();
    double[] velocity = new double[featureNum]; // starts at zero by language guarantee
    for (int k = 0; k < conf.getMaxIter(); k++) {
        int instanceId = random.nextInt(corpus.getInstancesNum()) + 1; // ids assumed 1-based
        LRInstance instance = corpus.getInstances().get(instanceId);
        double predict = sigmod(instance.getCurrRTW(weights));
        double residual = instance.getLabel() - predict;
        for (int i = 0; i < featureNum; i++) {
            double gradient = residual * instance.getFeatureIndex(i);
            velocity[i] = conf.getGamma() * velocity[i] + conf.getShrinkage() * gradient;
            weights[i] += velocity[i];
        }
        error(k);
    }
}
中间计算 gradient 时的收敛停止条件在运行中被我注释掉了，因为样本量太小，Epsilon 的设置基本无效。