C++ 实现text格式数据集的读取

ByteVortex

于 2023-04-11 09:32:33 发布

阅读量181

点赞数

文章标签： c++ 算法 开发语言 机器学习

本文链接：https://blog.csdn.net/weixin_44043309/article/details/130075962

版权

这段代码主要展示了如何使用C++读取文件的行数和列数，以及对数据进行标准化（归一化）处理。在读取文件时，分别计算了以换行符分隔的行数和通过流提取的列数。此外，还包含了内存管理和数据读取的函数，如读取MNIST数据集的特征和标签，以及对数据进行标准差归一化。

摘要由CSDN通过智能技术生成

读取行

// Counts the non-empty lines of a text file.
//
// fileName: path of the file to scan.
// Returns the number of lines that contain at least one character other
// than a trailing carriage return, or 0 when the file cannot be opened.
//
// A line consisting solely of '\r' (a blank line in a CRLF-terminated file)
// is treated as empty, so Windows and Unix line endings give the same count.
    int getFileRows(const char *fileName) {
        std::ifstream fileStream(fileName);
        if (!fileStream.is_open()) {
            return 0;  // unreadable file: report zero rows
        }

        int count = 0;
        std::string line;
        while (std::getline(fileStream, line)) {
            // getline only strips '\n'; drop the '\r' left behind by CRLF files.
            if (!line.empty() && line[line.size() - 1] == '\r') {
                line.erase(line.size() - 1);
            }
            if (!line.empty()) {
                ++count;
            }
        }
        return count;  // the stream is closed by its destructor
    }

读取列

// Counts the numeric columns on the first line of a delimited text file.
//
// fileName: path of the file to inspect.
// Returns the number of values parsed from the first line, or 0 when the
// file cannot be opened, is empty, or starts with an empty line.
//
// Values are expected to be separated by exactly one non-whitespace
// character (for example ','); blanks around the separator are tolerated.
// Counting stops at '\n', '\r', end of file, or the first token that does
// not parse as a number, so an unreadable or malformed file can no longer
// keep the loop spinning.
    int getFileColumns(const char *fileName) {
        std::ifstream fileStream(fileName);
        if (!fileStream.is_open()) {
            return 0;
        }

        const std::ifstream::int_type endOfFile = std::ifstream::traits_type::eof();
        double value = 0;
        int count = 0;
        for (;;) {
            // peek() inspects the next character without consuming it.
            std::ifstream::int_type next = fileStream.peek();
            if (next == endOfFile || next == '\n' || next == '\r') {
                break;
            }
            if (!(fileStream >> value)) {
                break;  // not a number: stop instead of looping on a failed stream
            }
            ++count;

            // Skip blanks that may precede the separator or the line end.
            next = fileStream.peek();
            while (next == ' ' || next == '\t') {
                fileStream.get();
                next = fileStream.peek();
            }
            if (next == endOfFile || next == '\n' || next == '\r') {
                break;
            }
            fileStream.get();  // consume the single separator character
        }
        return count;
    }

读取训练数据（注：下方代码与上文“读取列”中的 getFileColumns 完全相同，原文此处未给出读取训练数据的实现）

// Counts the numeric columns on the first line of a delimited text file.
//
// fileName: path of the file to inspect.
// Returns the number of values parsed from the first line, or 0 when the
// file cannot be opened, is empty, or starts with an empty line.
//
// Values are expected to be separated by exactly one non-whitespace
// character (for example ','); blanks around the separator are tolerated.
// Counting stops at '\n', '\r', end of file, or the first token that does
// not parse as a number, so an unreadable or malformed file can no longer
// keep the loop spinning.
    int getFileColumns(const char *fileName) {
        std::ifstream fileStream(fileName);
        if (!fileStream.is_open()) {
            return 0;
        }

        const std::ifstream::int_type endOfFile = std::ifstream::traits_type::eof();
        double value = 0;
        int count = 0;
        for (;;) {
            // peek() inspects the next character without consuming it.
            std::ifstream::int_type next = fileStream.peek();
            if (next == endOfFile || next == '\n' || next == '\r') {
                break;
            }
            if (!(fileStream >> value)) {
                break;  // not a number: stop instead of looping on a failed stream
            }
            ++count;

            // Skip blanks that may precede the separator or the line end.
            next = fileStream.peek();
            while (next == ' ' || next == '\t') {
                fileStream.get();
                next = fileStream.peek();
            }
            if (next == endOfFile || next == '\n' || next == '\r') {
                break;
            }
            fileStream.get();  // consume the single separator character
        }
        return count;
    }

释放内存

 // Releases the two arrays referenced through x and y with delete[] and
 // resets the caller's pointers to NULL.
 //
 // x, y: addresses of the array pointers to release; either may be NULL, in
 //       which case that argument is ignored.
 //
 // Clearing the slots makes a repeated call harmless; previously the stale
 // pointers stayed in place and a second call deleted the same memory twice.
 void MNIST::FreeData(COMPUTE_TYPE **x, COMPUTE_TYPE **y) {
        if (x) {
            delete[] *x;  // delete[] on a null pointer is a no-op
            *x = NULL;
        }
        if (y) {
            delete[] *y;
            *y = NULL;
        }
    }

    // Releases the two double arrays referenced through x and y with delete[]
    // and resets the caller's pointers to NULL.
    //
    // x, y: addresses of the array pointers to release; either may be NULL, in
    //       which case that argument is ignored.
    //
    // Clearing the slots makes a repeated call harmless; previously the stale
    // pointers stayed in place and a second call deleted the same memory twice.
    void MNIST::FreeData(double **x, double **y) {
        if (x) {
            delete[] *x;  // delete[] on a null pointer is a no-op
            *x = NULL;
        }
        if (y) {
            delete[] *y;
            *y = NULL;
        }
    }

读取测试数据

 int MNIST::ReadTestData(double **x, double **y) {
        MPC_OPTION &mp = *MPC_OPTION::GetMPCOption();
        int nFeatures, nLabels;
        mp.local_n = getFileRows((const char *) mp.test_feature_path);
        mp.local_d -= 1;
        nFeatures = ReadFeatures(x, (const char *) mp.test_feature_path);
        nLabels = ReadLabels(y, (const char *) mp.test_label_path);
        if (nFeatures != mp.d || nLabels <= 0) return -1;
        mp.nTest = nLabels;
        mp.d = nFeatures;
        return 0;
    }

读取特征

 // Reads a feature matrix from a delimited text file, normalises it with
 // stdnorm and converts it to fixed-point scale.
 //
 // x:        receives a new[]-allocated array of (local_d + 1) * local_n
 //           doubles, stored row by row; each row holds local_d features
 //           followed by a constant 1 column. Untouched on failure.
 // filename: path of the feature file; values on a line are separated by a
 //           single non-whitespace character.
 // Returns local_d + 1 on success, or -1 when x is NULL, the requested
 // local_d / local_n exceed what the file holds, the file cannot be opened,
 // or the data is truncated or malformed.
 //
 // Only the first local_d columns of the first local_n rows are used; extra
 // columns on a line are skipped. Every value is multiplied by
 // 2^fraction_bits after normalisation.
 int MNIST::ReadFeatures(double **x, const char *filename) {
        MPC_OPTION &mp = *MPC_OPTION::GetMPCOption();
        if (x == NULL) return -1;

        // Reject requests for more columns or rows than the file provides.
        const int realFeatures = getFileColumns(filename);
        const int reallables = getFileRows(filename);
        const int number_of_features = mp.local_d;
        const int number_of_lables = mp.local_n;
        if (number_of_features > realFeatures || number_of_features <= 0) {
            Log(DEBUG_INFO, "input data_d out of range");
            return -1;
        }
        if (number_of_lables > reallables || number_of_lables <= 0) {
            Log(DEBUG_INFO, "input data_n out of range");
            return -1;
        }

        std::ifstream fileStream;
        fileStream.open(filename);
        if (!fileStream) {
            Log(DEBUG_INFO, "[ReadFeatures] cannot open file:%s\n", filename);
            return -1;
        }

        const std::ifstream::int_type endOfFile = std::ifstream::traits_type::eof();
        const int maxIndex = (number_of_features + 1) * number_of_lables;
        double *xx = new double[maxIndex];
        double tmp = 0;       // value currently being parsed
        char douhao;          // receives the separator character
        int colCount = 0;     // zero-based column of the value just stored
        int index = 0;        // next free slot in xx
        bool wellFormed = true;

        while (index < maxIndex) {
            if (!(fileStream >> tmp)) {
                wellFormed = false;  // ran out of data or hit a non-numeric token
                break;
            }
            xx[index++] = tmp;

            std::ifstream::int_type next = fileStream.peek();
            if (colCount + 1 != number_of_features) {
                // More features are expected on this line.
                if (next == '\n' || next == '\r' || next == endOfFile) {
                    wellFormed = false;  // row is shorter than local_d columns
                    break;
                }
                ++colCount;
                fileStream >> douhao;
            } else {
                // Row complete: discard any remaining columns up to (not
                // including) the newline, stopping at end of file so a
                // missing final newline cannot stall the loop.
                colCount = 0;
                while (next != '\n' && next != endOfFile) {
                    fileStream.get();
                    next = fileStream.peek();
                }
                xx[index++] = 1;  // trailing constant column
            }
        }
        fileStream.close();

        if (!wellFormed) {
            Log(DEBUG_INFO, "[ReadFeatures] malformed or truncated data in file:%s\n", filename);
            delete[] xx;
            return -1;
        }

        // Normalise into the output buffer, then scale to fixed point in place.
        double *normalized = new double[maxIndex];
        stdnorm(normalized, xx);
        delete[] xx;

        const double scale = static_cast<double>(1 << mp.fraction_bits);
        for (int i = 0; i < maxIndex; i++) {
            normalized[i] *= scale;
        }
        *x = normalized;
        return number_of_features + 1;
    }

读取标签

 // Reads one label per line from a text file and converts it to fixed-point
 // scale.
 //
 // y:        receives a new[]-allocated array of local_n doubles, each being
 //           the first value of a line multiplied by 2^fraction_bits.
 //           Untouched on failure.
 // filename: path of the label file.
 // Returns local_n on success, or -1 when y is NULL, local_n is not within
 // 1..(non-empty lines in the file), the file cannot be opened, or a label
 // cannot be parsed.
 //
 // Anything after the first value on a line is ignored.
 int MNIST::ReadLabels(double **y, const char *filename) {
        MPC_OPTION &mp = *MPC_OPTION::GetMPCOption();
        if (y == NULL) return -1;

        // Reject requests for more rows than the file provides.
        const int reallables = getFileRows(filename);
        const int number_of_lables = mp.local_n;
        if (number_of_lables > reallables || number_of_lables <= 0) {
            Log(DEBUG_INFO, "input data_n out of range");
            return -1;
        }

        std::ifstream fileStream;
        fileStream.open(filename);
        if (!fileStream) {
            Log(DEBUG_INFO, "[ReadLabels] cannot open file:%s\n", filename);
            return -1;
        }

        const std::ifstream::int_type endOfFile = std::ifstream::traits_type::eof();
        const double scale = static_cast<double>(1 << mp.fraction_bits);
        double *labels = new double[number_of_lables];

        for (int index = 0; index < number_of_lables; ++index) {
            double tmp = 0;
            if (!(fileStream >> tmp)) {
                Log(DEBUG_INFO, "[ReadLabels] malformed or truncated data in file:%s\n", filename);
                delete[] labels;
                return -1;
            }
            labels[index] = tmp * scale;

            // Discard the rest of the line up to (not including) the newline,
            // stopping at end of file so a missing final newline cannot stall
            // the loop.
            std::ifstream::int_type next = fileStream.peek();
            while (next != '\n' && next != endOfFile) {
                fileStream.get();
                next = fileStream.peek();
            }
        }

        *y = labels;
        return number_of_lables;  // the stream is closed by its destructor
    }

数据标准化（归一化）

// Normalises every feature column of a row-major matrix.
//
// in:  local_n rows of (local_d + 1) doubles; the last slot of each row is
//      not read.
// out: buffer of the same layout that receives the result.
//
// For each of the local_d feature columns the mean and the mean squared
// deviation are computed over all rows. A value is written as
// (value - mean) / meanSquaredDeviation, or copied unchanged when the
// deviation is exactly zero. The last slot of every output row is set to 1.
// NOTE(review): the divisor is the variance, not its square root, although
// the function name suggests standard-deviation scaling — confirm intent.
void MNIST::stdnorm(double *out, double *in) {
        MPC_OPTION &mp = *MPC_OPTION::GetMPCOption();
        const int cols = mp.local_d;
        const int rows = mp.local_n;
        const int stride = cols + 1;  // features plus the trailing constant slot

        // Column means; rows are visited in ascending order for every column.
        double *mean = new double[cols]();
        for (int r = 0; r < rows; ++r) {
            const double *src = in + r * stride;
            for (int c = 0; c < cols; ++c) {
                mean[c] += src[c];
            }
        }
        for (int c = 0; c < cols; ++c) {
            mean[c] /= rows;
        }

        // Column mean squared deviations.
        double *spread = new double[cols]();
        for (int r = 0; r < rows; ++r) {
            const double *src = in + r * stride;
            for (int c = 0; c < cols; ++c) {
                const double delta = src[c] - mean[c];
                spread[c] += delta * delta;
            }
        }
        for (int c = 0; c < cols; ++c) {
            spread[c] /= rows;
        }

        // Emit the normalised rows.
        for (int r = 0; r < rows; ++r) {
            const double *src = in + r * stride;
            double *dst = out + r * stride;
            for (int c = 0; c < cols; ++c) {
                if (spread[c] == 0) {
                    dst[c] = src[c];  // constant column: keep the raw value
                } else {
                    dst[c] = (src[c] - mean[c]) / spread[c];
                }
            }
            dst[cols] = 1;
        }

        delete[] mean;
        delete[] spread;
    }