1. From a probabilistic point of view, the posterior probability of a sample is inferred via Bayes' rule and takes a softmax form (sketched below).
Equation 4.63, the quantity inside that softmax, can take a fairly compact form, for example a linear expression.
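Written out as a sketch (the 4.62/4.63 numbering follows PRML, Section 4.2, which these notes reference):

    p(C_k \mid x) = \frac{p(x \mid C_k)\,p(C_k)}{\sum_j p(x \mid C_j)\,p(C_j)} = \frac{\exp(a_k)}{\sum_j \exp(a_j)}    (4.62)

    a_k = \ln p(x \mid C_k)\,p(C_k)    (4.63)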
2. Assume P(x | Ck) is a Gaussian;
then ln p(x|Ck)p(Ck) can be expressed as a linear expression, as follows:
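A sketch of that linear form, under the extra assumption that all classes share one covariance matrix \Sigma (so the quadratic term in x is common to every class and cancels in the softmax); here \mu_k is the class mean and p(C_k) the class prior:

    a_k = w_k^\top x + w_{k0}, \qquad w_k = \Sigma^{-1}\mu_k, \qquad w_{k0} = -\tfrac{1}{2}\,\mu_k^\top \Sigma^{-1}\mu_k + \ln p(C_k)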
3. Solving for the model parameters (see the sketch below):
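The code below fits w by gradient descent on the L2-regularized cross-entropy error. As a sketch, with y_{nk} = P(C_k | x_n) the predicted posterior, t_{nk} the one-hot target, \lambda the regularization weight (FLAGS_lambda) and \alpha the learning rate (FLAGS_alpha):

    E(w) = -\sum_n \sum_k t_{nk}\,\ln y_{nk} + \frac{\lambda}{2}\sum_k \lVert w_k \rVert^2

    \nabla_{w_k} E = \sum_n (y_{nk} - t_{nk})\,x_n + \lambda\,w_k, \qquad w_k \leftarrow w_k - \alpha\,\nabla_{w_k} E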
4. Fundamentally, a probabilistic model is the more reliable way to do classification: intuitively, where the points of one class are dense, the probability of that class is high. With a squared-error criterion, the distance being minimized does not really measure how far a point is from a class.
SVC uses a similar kind of distance, but it relies only on the support vectors and projects the data into a higher-dimensional space.
5. sample code:
#include <string>
#include <vector>
#include <cmath>
#include <map>
#include "base/flags.h"
#include "base/string_util.h"
#include "utils/hash_tables.h"
#include "common/file/simple_line_reader.h"
DEFINE_string(train_path, "./test.txt", "training file");
DEFINE_double(lambda, 0.001, "weight of the regularization term");
DEFINE_double(alpha, 0.1, "the learning rate");
DEFINE_int32(n, 5000, "number of training iterations");
// A single training sample: original label plus a sparse feature map.
struct DataSample {
std::string label;
double predict_prob;
utils::hash_map<std::string, double> features;
void AddFeature(const std::string& fn, const double& v) {
features[fn] = v;
}
};
// inner-class labels are: 0, 1, 2, ..., label_count_-1
class TrainningDataSet {
public:
TrainningDataSet() {
label_count_ = 0;
inner_feature_map_["cb"] = 0;
outer_feature_map_[0] = "cb";
feature_count_ = 1;
}
// format like: "A 1:0.2 2:0.4 3:77 4:0.3"
bool LoadSamplesFromFile(const std::string& file_path) {
file::SimpleLineReader line_reader;
line_reader.OpenOrDie(file_path);
std::vector<std::string> lines;
line_reader.ReadLines(&lines);
for (size_t i = 0; i < lines.size(); ++i) {
std::vector<std::string> parts;
DataSample sample;
SplitString(lines[i], ' ', &parts);
sample.label = parts[0];
AddLabel(parts[0]);
for (size_t j = 1; j < parts.size(); ++j) {
std::vector<std::string> fn_v;
SplitString(parts[j], ':', &fn_v);
if (fn_v.size() != 2) {
continue;
}
double v = 0.0f;
StringToDouble(fn_v[1], &v);
sample.AddFeature(fn_v[0], v);
AddFeature(fn_v[0]);
}
samples_.push_back(sample);
}
return true;
}
void Train() {
AllocAuxParam();
TrainInternal(FLAGS_n);
Predict();
FreeAuxParam();
}
void Predict() {
double prob[100];  // assumes label_count_ <= 100
// Xn
for (auto it = samples_.begin(); it != samples_.end(); ++it) {
for (int k = 0; k < label_count_; ++k) {
prob[k] = w[k][0]*1.0f;
for (auto sit = it->features.begin(); sit != it->features.end(); ++sit) {
prob[k] += sit->second*w[k][inner_feature_map_[sit->first]];
}
prob[k] = exp(prob[k]);
}
double total_exp = 0.0f;
for (int k = 0; k < label_count_; ++k) {
total_exp += prob[k];
}
it->predict_prob = prob[inner_label_map_[it->label]]/total_exp;
VLOG(0) << "sampel predict [" << it->label << "]: " << it->predict_prob;
}
}
void TrainInternal(int32 count) {
for (int32 i = 0; i < count; ++i) {
//VLOG(0) << "training iteration: " << (i+1);
// calculate the posterior P(Ck | xn) for every sample
for (size_t n = 0; n < samples_.size(); ++n) {
const DataSample& sample = samples_[n];
for (int32 k = 0; k < label_count_; ++k) {
// W*X, x[0] = 1;
post_prob[n][k] = 1.0f*w[k][0];
for (auto it = sample.features.begin(); it != sample.features.end(); ++it) {
std::string outer_feature_idx = it->first;
double val = it->second;
int32 feature_idx = inner_feature_map_[outer_feature_idx];
post_prob[n][k] += val*w[k][feature_idx];
}
post_prob[n][k] = exp(post_prob[n][k]);
}
double exp_total = 0.0f;
for (int32 k = 0; k < label_count_; ++k) {
exp_total += post_prob[n][k];
}
for (int32 k = 0; k < label_count_; ++k) {
post_prob[n][k] /= exp_total;
//VLOG(0) << "P(C" << k << "|X" << n << ") = " << post_prob[n][k];
}
}
// calculate the gradient dE/dw[k]
for (int32 k = 0; k < label_count_; ++k) {
for (int32 d = 0; d < feature_count_; ++d) {
grad[k][d] = 0.0f;
}
// iteration on every sample
for (size_t n = 0; n < samples_.size(); ++n) {
// iteration on every dimension
double Tnk = GetTnk(n, k);
double Ynk = post_prob[n][k];
for (int32 d = 0; d < feature_count_; ++d) {
grad[k][d] += GetXnd(n, d)*(Ynk - Tnk);
}
}
//std::string w_str;
for (int32 d = 0; d < feature_count_; ++d) {
grad[k][d] += w[k][d]*FLAGS_lambda;
w[k][d] -= FLAGS_alpha*grad[k][d];
//w_str.append(outer_feature_map_[d]).append(":").append(DoubleToString(w[k][d])).append(",");
}
//VLOG(0) << "[w" << k << "]: " << w_str;
}
}
}
void Dump() {
utils::hash_map<std::string, int>::iterator it;
for (it = inner_label_map_.begin(); it != inner_label_map_.end(); ++it) {
VLOG(0) << "lable: " << it->first << ", " << it->second;
}
for (it = inner_feature_map_.begin(); it != inner_feature_map_.end(); ++it) {
VLOG(0) << "featu: " << it->first << ", " << it->second;
}
}
private:
double** w; // w[k][d], update: w[k] = w[k] - alpha*grad[k]
double** grad; // grad[k][d] = sum_n (Ynk - Tnk)*Xnd + lambda*w[k][d]
double** post_prob; // post_prob[n][k] = P(Ck | xn)
std::vector<DataSample> samples_;
utils::hash_map<std::string, int> inner_label_map_; // e.g. 'A' -> 0, 'B' -> 1
utils::hash_map<int, std::string> outer_label_map_; // e.g. 0 -> 'A', 1 -> 'B'
int32 label_count_;
int AddLabel(const std::string& outer_label) {
utils::hash_map<std::string, int>::iterator it = inner_label_map_.find(outer_label);
if (it == inner_label_map_.end()) {
// First time this label is seen: assign it the next inner id.
inner_label_map_[outer_label] = label_count_;
outer_label_map_[label_count_] = outer_label;
return label_count_++;
}
return it->second;
}
utils::hash_map<std::string, int> inner_feature_map_; // "const_bias" -> 0, "1" -> 1, "2" -> 2, "url_host_big" -> feature_count
utils::hash_map<int, std::string> outer_feature_map_;
int32 feature_count_;
void AddFeature(const std::string& feature_name) {
utils::hash_map<std::string, int>::iterator it = inner_feature_map_.find(feature_name);
if (it == inner_feature_map_.end()) {
inner_feature_map_[feature_name] = feature_count_;
outer_feature_map_[feature_count_] = feature_name;
feature_count_++;
}
}
double GetXnd(const int32& n, const int32& d) {
double ret = 0.0f;
if (d == 0) {
ret = 1.0f;
} else {
DataSample& sample = samples_[n];
std::string& ol = outer_feature_map_[d];
auto it = sample.features.find(ol);
if (it != sample.features.end()) {
ret = it->second;
}
}
//VLOG(0) << "X(" << n << "," << d << ") = " << ret;
return ret;
}
double GetTnk(const int32& n, const int32& k) {
double ret = 0.0f;
if (inner_label_map_[samples_[n].label] == k) {
ret = 1.0f;
}
//VLOG(0) << "T(" << n << "," << k << ") = " << ret;
return ret;
}
void FreeAuxParam() {
// Arrays allocated with new[] must be released with delete[].
for (int k = 0; k < label_count_; ++k) {
delete[] w[k];
delete[] grad[k];
}
delete[] w;
delete[] grad;
for (size_t n = 0; n < samples_.size(); ++n) {
delete[] post_prob[n];
}
delete[] post_prob;
}
void AllocAuxParam() {
// w[k][d], grad[k][d]
w = new double*[label_count_];
grad = new double*[label_count_];
for (int k = 0; k < label_count_; ++k) {
w[k] = new double[feature_count_];
grad[k] = new double[feature_count_];
for (int f = 0; f < feature_count_; ++f) {
w[k][f] = 0.0f;
}
}
// post_prob[n][k]
post_prob = new double*[samples_.size()];
for (size_t n = 0; n < samples_.size(); ++n) {
post_prob[n] = new double[label_count_];
}
}
};
int main(int argc, char* argv[]) {
base::ParseCommandLineFlags(&argc, &argv, false);
TrainningDataSet tds;
tds.LoadSamplesFromFile(FLAGS_train_path);
tds.Dump();
tds.Train();
return 0;
}
Test samples:
A 1:0.20 2:0.70
A 1:0.10 2:0.80
A 1:0.30 2:0.60
A 1:0.05 2:0.94
A 1:0.77 2:0.22
A 1:0.44 2:0.55
B 1:0.20 2:0.81
B 1:0.30 2:0.71
B 1:1.00 2:0.01
B 1:0.50 2:0.51
B 1:0.40 2:0.65
B 1:0.70 2:0.40
Results:
I0930 09:53:14.443656 32046 lr.cc:100] sample predict [A]: 0.926048
I0930 09:53:14.443763 32046 lr.cc:100] sample predict [A]: 0.932346
I0930 09:53:14.443836 32046 lr.cc:100] sample predict [A]: 0.919215
I0930 09:53:14.443896 32046 lr.cc:100] sample predict [A]: 0.592285
I0930 09:53:14.443940 32046 lr.cc:100] sample predict [A]: 0.421599
I0930 09:53:14.443987 32046 lr.cc:100] sample predict [A]: 0.499967
I0930 09:53:14.444035 32046 lr.cc:100] sample predict [B]: 0.569759
I0930 09:53:14.444111 32046 lr.cc:100] sample predict [B]: 0.593065
I0930 09:53:14.444164 32046 lr.cc:100] sample predict [B]: 0.740223
I0930 09:53:14.444211 32046 lr.cc:100] sample predict [B]: 0.638351
I0930 09:53:14.444258 32046 lr.cc:100] sample predict [B]: 0.816627
I0930 09:53:14.444300 32046 lr.cc:100] sample predict [B]: 0.955107
A few of the points in the middle are very close to each other, so their predicted probabilities are not that high.