实习笔记Day13(2022.8.22)

最新推荐文章于 2024-08-22 17:27:46 发布

歌者｀

最新推荐文章于 2024-08-22 17:27:46 发布

阅读量174

点赞数

分类专栏： 2022暑期实习笔记文章标签：算法

本文链接：https://blog.csdn.net/m0_52774249/article/details/126463354

版权

2022暑期实习笔记专栏收录该内容

13 篇文章 1 订阅

订阅专栏

一.总结

研究了一下我读的第一篇论文里的开源代码的 k-means 聚类部分

二.学习笔记

1.kmeans.h文件

#ifndef _OPENSOURCE_PSC_KMEANS_H__
#define _OPENSOURCE_PSC_KMENAS_H__

#include "common.h"

namespace learning_psc {
class KMeans {
 public:
  KMeans();   //构造函数
  // 读取输入文件并存储上一步的特征向量信息
  void Read(const string& filename);
  // 进行k-means聚类
  void DoKMeans(int num_clusters,
                const string& kmeans_initialize_method,
                int kmeans_max_loop,
                double kmeans_threshold);
  // 写聚类的结果
  void Write(const string& output_path);
  // For unitest, get cluster result.
  const vector<int>& local_memberships() { return local_memberships_; }
 private:
  // 初始化聚类中心
  void InitializeCenters(const string& kmeans_initialize_method);
  //以下各个参数的意义和函数的作用见后续各个函数的剖析部分
  void GeneratePermutation(int length, int buf_size,
                           int seed, vector<int> *perm);
  void KMeansClustering(int kmeans_max_loop, double kmeans_threshold);
  void FindNearestCluster(int data_point_index,
                          int *center_index,
                          double *min_distance);
  double DotProduct(const vector<double>& v1, const double* v2);
  double DotProduct(const vector<double>& v1, const vector<double>& v2);
  double ComputeDistance(const vector<double> &data_point_1,
                         const vector<double> &data_point_2);
  double ComputeDistance(const vector<double> &data_point_1,
                         const double* data_point_2);
  vector<vector<double> > local_rows_;
  int num_total_rows_;
  int my_start_row_index_;
  vector<int> row_count_of_each_proc_;
  int num_clusters_;
  int num_columns_;
  vector<double> cluster_centers_storage_;
  vector<double*> cluster_centers_;
  vector<int> cluster_sizes_;
  vector<int> local_memberships_;

  int myid_;
  int pnum_;
};

}  // namespace learning_psc

#endif  // _OPENSOURCE_PSC_KMEANS_H__

2.kmeans.cc文件（部分）

（1）构造函数 KMeans

#include <cfloat>
#include "common.h"
#include "kmeans.h"

namespace learning_psc {
KMeans::KMeans()
    : num_total_rows_(0),
      my_start_row_index_(0) {
  MPI_Comm_rank(MPI_COMM_WORLD, &myid_);
  MPI_Comm_size(MPI_COMM_WORLD, &pnum_);
}
//构造函数对部分类成员初始化，myid是当前的进程号，pnum是总的进程数

（2）输入数据读取函数 Read

void KMeans::Read(const string& filename) {
  num_total_rows_ = 0;
  if (myid_ == 0) {   //主进程
    ifstream fin2(filename.c_str());
    //c_str方法是为了和C语言兼容，返回内容和filename一样，
    不过是C中字符串的样式，而不再是string类型
    string line;
    while (getline(fin2, line)) {  // 每一行就是一个训练文件
      if (line.size() > 0 &&      // 跳过空行
        line[0] != '\r' &&        // 跳过空行
        line[0] != '\n' &&        // 跳过空行
        line[0] != '#') {         // 跳过注释行
        ++num_total_rows_;   //num_total_rows记录读取的总行数
      }
    }
  }
  MPI_Bcast(&num_total_rows_, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // 把读取的总行数信息广播给所有进程（包括0号进程自己）

  // 计算每一台电脑有多少行数据
  // row_count_of_each_proc_是一个元素类型为int的vector
  // 将row_count_of_each_proc_设置为p_num个元素0
  row_count_of_each_proc_.resize(pnum_, 0);
  // 每台机器平均分配行数，不能整除的依次一行一行地加
  for (int i = 0; i < pnum_; ++i) {
    row_count_of_each_proc_[i] = num_total_rows_ / pnum_ +
    (num_total_rows_ % pnum_ > i ? 1 : 0);
  }

  // 根据子进程号确定该进程开始读取数据的行坐标（从0开始）
  my_start_row_index_ = 0;
  for (int i = 0; i < myid_; ++i) {
    my_start_row_index_ += row_count_of_each_proc_[i];
  }
  // 读取该子进程应该读的部分
  ifstream fin2(filename.c_str());
  string line;
  int i = 0;
  while (getline(fin2, line)) {  // 每一行就是一个训练文件
    if (line.size() > 0 &&      // 跳过空行
        line[0] != '\r' &&      // 跳过空行
        line[0] != '\n' &&      // 跳过空行
        line[0] != '#') {       // 跳过注释行
      // 不属于该子进程的行直接跳过
      if (i < my_start_row_index_ || i >= my_start_row_index_ +                 
      row_count_of_each_proc_[myid_]) {
        ++i;
        continue;
      }
      istringstream ss(line);
      vector<double> row;
      //row的每一个元素是该行中以空格分隔的一个个double类型的值
      //也就是该向量的每一维分量
      double value;
      while (ss >> value) {  // Load and init a document.
        row.push_back(value);
      }
      local_rows_.push_back(row);
      //local_rows_的每一个元素是属于当前子进程的每一行数据（vector类型）
      ++i;
    }
  }
  // 检查每一行数据是不是相同维度的
  for (int i = 1; i < local_rows_.size(); ++i) {
    CHECK_EQ(local_rows_[0].size(), local_rows_[i].size());
  }
  num_columns_ = local_rows_[0].size();
}

（3）聚类结果写回函数 Write

void KMeans::Write(const string& filename) {
  for (int p = 0; p < pnum_; ++p) {
    if (p == myid_) {
      std::ofstream fout;
      if (p == 0) {
        fout.open(filename.c_str());
      } else {
        fout.open(filename.c_str(), std::ios::app);
        //std::ios::app表示如果没有文件则生成空文件；如果有则追加
      }
      for (int i = 0; i < local_memberships_.size(); ++i) {
        fout << local_memberships_[i] << std::endl; //？
      }
      fout.close();
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //同步，只有当前循环的工作全部完成，才进入下一次循环
  }
}

（4）两个同维向量点积函数 DotProduct

double KMeans::DotProduct(const vector<double>& v1, const double* v2) {
  double norm_sq = 0.0;
  for (int i = 0; i < v1.size(); ++i) {
    norm_sq += v1[i] * v2[i];
  }
  return norm_sq;
}
double KMeans::DotProduct(const vector<double>& v1, const vector<double>& v2) {
  double norm_sq = 0.0;
  for (int i = 0; i < v1.size(); ++i) {
    norm_sq += v1[i] * v2[i];
  }
  return norm_sq;
}

（5）质心初始化函数 InitializeCenters

void KMeans::InitializeCenters(const string& kmeans_initialize_method) {
  // 单位化特征向量，特征向量存储在local_rows_中
  for (int i = 0; i < local_rows_.size(); ++i) {
    double norm_i = DotProduct(local_rows_[i], local_rows_[i]);
    if (norm_i != 0) {
      norm_i = sqrt(norm_i);
      for (int j = 0; j < local_rows_[i].size(); ++j) {
        local_rows_[i][j] /= norm_i;
      }
    }
  }

  //初始化
  cluster_centers_storage_.resize(num_clusters_ * num_columns_, 0);
  cluster_centers_.resize(num_clusters_, 0);
  for (int i = 0; i < num_clusters_; ++i) {
    cluster_centers_[i] = &(cluster_centers_storage_[i * num_columns_]);
  }
  //总共num_clusters_个聚类中心，它们顺序地存储在cluster_centers_storage_中
  //cluster_centers_[i]存储的是第i个聚类中心的起始地址

  //选择方案一：每个初始聚类中心是两两正交的
  if (kmeans_initialize_method == "orthogonal_centers") {
    // 先随机选取一行作为初始的第一个聚类中心
    int rand_index = 0;
    if (myid_ == 0) { //主进程
      //static的作用是把double转换为int
      rand_index =
          static_cast<int>(RandDouble()
                           * local_rows_.size());
      //初始化随机选取的第一个聚类中心点
      for (int i = 0; i < num_columns_; ++i) {
        cluster_centers_[0][i] = local_rows_[rand_index][i];
      }
    }
    //把随机选取的第一个聚类中心点信息广播给所有进程
    MPI_Bcast(&cluster_centers_[0][0],
              num_columns_,
              MPI_DOUBLE,
              0, MPI_COMM_WORLD);
    //剩下的num_clusters个聚类中心点的选取方法：
    //对于每一个子进程，都要：
    //遍历该子进程管理的每一个特征向量，选择与上一个聚类中心点积绝对值最小者作为候选
    //然后收集所有子进程的候选者，再比较出点积最小的那个，作为下一个聚类中心
    vector<double> additional_vector(local_rows_.size(), 0);
    for (int k = 1; k < num_clusters_; ++k) {
      int orthogonal_angle_index = -1;
      double orthogonal_angle = FLT_MAX;
      for (int i = 0; i < local_rows_.size(); ++i) {
        additional_vector[i] +=
            fabs(DotProduct(local_rows_[i],
                            cluster_centers_[k - 1]));
        if (additional_vector[i] < orthogonal_angle) {
          orthogonal_angle = additional_vector[i];
          orthogonal_angle_index = i;
        }
      }
      //收集候选，比较
      IndexValue local_id_angle, global_id_angle;
      local_id_angle.index = myid_;
      local_id_angle.value = orthogonal_angle;
      //规约，找出点积最小的那个，下标记录在global_orthogonal_proc_index中
      MPI_Allreduce(&local_id_angle,
                    &global_id_angle,
                    1,
                    MPI_DOUBLE_INT,
                    MPI_MINLOC, MPI_COMM_WORLD);
      int global_orthogonal_proc_index = global_id_angle.index;
      //存储新找到的聚类中心的信息
      if (myid_ == global_orthogonal_proc_index) {
        for (int i = 0; i < num_columns_; ++i) {
          cluster_centers_[k][i] =
              local_rows_[orthogonal_angle_index][i];
        }
      }
      //把新找到的聚类中心的信息广播给所有进程
      MPI_Bcast(&cluster_centers_[k][0],
                num_columns_,
                MPI_DOUBLE,
                global_orthogonal_proc_index, MPI_COMM_WORLD);
    }
    for (int i = 0; i < num_clusters_; ++i) {
      cluster_sizes_.push_back(0);
    }
  } 
  //选择方案二：每个初始聚类中心是随机生成的
  else if (kmeans_initialize_method == "random") {
    vector<int> random_permutation_index(num_total_rows_);
    if (myid_ == 0) {
    //主进程调用GeneratePermutation得到random_permutation_index数组
      GeneratePermutation(num_total_rows_,
                          50,
                          0,
                          &random_permutation_index);
    }
    //把random_permutation_index数组信息广播给所有进程
    MPI_Bcast(&random_permutation_index[0],
              num_total_rows_,
              MPI_INT,
              0, MPI_COMM_WORLD);
    for (int i = 0; i < num_clusters_; ++i) {
      for (int j = 0; j < num_columns_; ++j) {
        double value;
        //random_permutation_index[i]代表随机生成的第i个初始聚类中心的行号
        //在规定范围内时，找到质心的信息，存储进cluster_centers_数组中
        if (random_permutation_index[i] >= my_start_row_index_ &&
            random_permutation_index[i] < my_start_row_index_ +
                                          local_rows_.size()) {
          value = local_rows_[random_permutation_index[i] -
                  my_start_row_index_][j];
        } else {
          value = 0;
        }
        cluster_centers_[i][j] = value;
      }
      cluster_sizes_.push_back(0);
    }
    vector<double> cluster_centers_storage_backup(num_clusters_ * num_columns_);
    //这里用了一个cluster_centers_storage_backup数组记录了质心和，
      估计是为了后续聚类时更新质心所用
    MPI_Allreduce(&cluster_centers_storage_[0],
                  &cluster_centers_storage_backup[0],
                  num_clusters_ * num_columns_,
                  MPI_DOUBLE,
                  MPI_SUM, MPI_COMM_WORLD);
    memcpy(&cluster_centers_storage_[0],
           &cluster_centers_storage_backup[0], num_clusters_ * num_columns_);
  
  } 
  else {
    LOG(FATAL) << "No such an initialization method: "
               << kmeans_initialize_method << std::endl;
  }
}