Naive Bayes Algorithm: C++ Implementation + Laplace Smoothing + Algorithm Optimization
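The classifier below scores a test text d against each emotion class c with the standard multinomial naive Bayes rule (the code then multiplies in an extra class-frequency factor as its "optimization"). With Laplace (add-one) smoothing, the word likelihood is estimated from the per-class counts the code maintains: word_bag gives count(w, c), sum_words gives N_c, and all_words.size() gives the vocabulary size |V|:

P(c \mid d) \propto P(c) \prod_{w \in d} P(w \mid c),
\qquad
P(w \mid c) = \frac{\mathrm{count}(w, c) + 1}{N_c + |V|}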

Classification:

#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>

using namespace std;


struct train_data {
    int index;                      //index of the training sample
    int emotion_value;              //emotion label id
    string emotion;                 //emotion name
    vector<string> word;            //words of the training text
    int onehot[1000];               //values of the one-hot matrix row
    double distance;                //distance (unused in this program)

    train_data(int a = 0, int b = 0, string c = "", double d = 0.0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        word.clear();
        for (int i = 0; i < 1000; i ++)
            onehot[i] = 0;
    }
};

vector<string> train_text;          //each complete training text line
vector<string> all_words;           //all distinct words (the vertical axis)
vector<train_data> all_trains;      //all training samples (the horizontal axis)
int right_sum;                      //number of correct predictions


//NB adding part
struct each_word {
    string name;                    //the word itself
    double time;                    //how many times the word occurs in the class
    each_word(string a = "", double b = 0) {
        name = a;
        time = b;
    }
};
//build a bag of words for each class

struct emotion {
    string emotion2;                //class name
    vector<each_word> word_bag;     //bag of words for this class
    double sum_words;               //total number of word tokens in this class
    double probability;             //score of the current test text under this class
    double times;                   //number of training texts labelled with this class
    emotion(double a = 0, double b = 0, double c = 0) {
        word_bag.clear();
        sum_words = a;
        probability = b;
        times = c;
    }
};

emotion each_emotion[7];            //classes live at indices 1..6; index 0 is unused



//NB adding part

//optimization part

double add[7] = {0};

//optimization part

void reading_file(void );
void class_calculating();
bool cmp(const emotion & , const emotion & );

int main() {

    train_text.clear();
    all_words.clear();
    all_trains.clear();

    each_emotion[1].emotion2 = "anger";
    each_emotion[2].emotion2 = "disgust";
    each_emotion[3].emotion2 = "fear";
    each_emotion[4].emotion2 = "joy";
    each_emotion[5].emotion2 = "sad";
    each_emotion[6].emotion2 = "surprise";
    reading_file();
    class_calculating();


    return 0;
}

void reading_file() {
    ifstream train("train.txt");
    char read[100];
    string temp;
    train.getline(read, 100);               //skip the header line
    while (train.getline(read, 100)) {      //loop on successful reads so no spurious line is pushed at end-of-file
        temp = read;
        train_text.push_back(temp);
    }
    train.close();
    stringstream s;
    int index;
    int emotion_value;
    string emotion;
    string word;
    for (int i = 0; i < train_text.size(); i ++) {
        s.str(train_text[i]);
        s >> index;
        s >> emotion_value;
        s >> emotion;
        each_emotion[emotion_value].times ++;
        while (s >> word) {                 //extract until the stream runs out (comparing s to NULL processes the last word twice)
            each_emotion[emotion_value].sum_words ++;
            bool flag = true;
            for (int j = 0; j < each_emotion[emotion_value].word_bag.size(); j ++) {
                if (each_emotion[emotion_value].word_bag[j].name == word) {
                    each_emotion[emotion_value].word_bag[j].time ++;
                    flag = false;
                    break;
                }
                else
                    continue;
            }
            if (flag) {
                each_word new_each_word(word, 1);
                each_emotion[emotion_value].word_bag.push_back(new_each_word);
            }

            bool flag1 = true;
            for (int i = 0; i < all_words.size(); i ++) {
                if (all_words[i] == word) {
                    flag1 = false;
                    break;                  
                }
                else 
                    continue;
            }
            if (flag1)
                all_words.push_back(word);
        }
        s.clear();
    }
}

void class_calculating() {
    ifstream t("test.txt");
    right_sum = 0;
    char c[100];
    string temp;
    t.getline(c, 100);
    while (t.getline(c, 100)) {
        train_data test_train;
        char *p = strtok(c, " ");
        p = strtok(NULL, " ");
        p = strtok(NULL, " ");
        temp = p;
        test_train.emotion = temp;
        int new_index;
        for (new_index = 1; new_index < 7; new_index ++) {
            if (each_emotion[new_index].emotion2 == temp)
                break;
        }
        p = strtok(NULL, " ");

        while (p != NULL) {
            temp = p;
            test_train.word.push_back(temp);

//          each_emotion[new_index].sum_words ++;
//          bool flag = true;
//          for (int j = 0; j < each_emotion[new_index].word_bag.size(); j ++) {
//                if (each_emotion[new_index].word_bag[j].name == temp) {
//                    each_emotion[new_index].word_bag[j].time ++;
//                    flag = false;
//                    break;
//                }
//                else
//                    continue;
//          }
//          if (flag) {
//                each_word new_each_word(temp, 1);
//                each_emotion[new_index].word_bag.push_back(new_each_word);
//          }
            bool flag1 = true;
            for (int i = 0; i < all_words.size(); i ++) {
                if (all_words[i] == temp) {
                    flag1 = false;
                    break;                  
                }
                else 
                    continue;
            }
            if (flag1)
                all_words.push_back(temp);
            p = strtok(NULL, " ");
        }

        for (int i = 1; i < 7; i ++) {      //valid classes are 1..6; each_emotion has 7 elements with index 0 unused
            double pro = each_emotion[i].times / train_text.size();
            for (int j = 0; j < test_train.word.size(); j ++) {
                bool flag = true;
                for (int k = 0; k < each_emotion[i].word_bag.size(); k ++) {
                    if (each_emotion[i].word_bag[k].name == test_train.word[j]) {
                        //Laplace smoothing: add 1 to the observed count so it matches the unseen-word branch below
                        pro *= ((each_emotion[i].word_bag[k].time + 1.0) / (each_emotion[i].sum_words + all_words.size()));
                        flag = false;
                        break;
                    }
                    else 
                        continue;
                }
                if (flag) {
                    pro *= (1.0 / (each_emotion[i].sum_words + all_words.size()));      //unseen word: Laplace count of 0 + 1
                }
            }
            each_emotion[i].probability = pro * (1.0 / (abs(each_emotion[i].times - train_text.size())));   //the "optimization" from the title: divide by the number of training texts outside this class
        }

        sort(each_emotion + 1, each_emotion + 7, cmp);      //sort classes 1..6 by score; the best class ends up at index 6
        cout << each_emotion[6].emotion2 << endl;
        if (each_emotion[6].emotion2 == test_train.emotion) {
            right_sum ++;       
        } 
    }
    cout << "正确个数: " << right_sum << endl;
//  for (int i = 1; i < 7 ; i ++)
//      cout << each_emotion[i].times << endl;
}

bool cmp(const emotion &a, const emotion & b) {
    return a.probability < b.probability;
}
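
One practical caveat: class_calculating() multiplies many small probabilities directly, which can underflow to zero for long texts. A common alternative (not used in the original code) is to accumulate log probabilities instead. Below is a minimal self-contained sketch of that idea; laplace_log_likelihood and all the sample counts are illustrative assumptions, not values from the original data.

#include <cmath>
#include <cstdio>

// Laplace-smoothed log likelihood of a word given a class:
// log((count + 1) / (total words in the class + vocabulary size)).
double laplace_log_likelihood(double word_count, double class_word_total, double vocab_size) {
    return std::log((word_count + 1.0) / (class_word_total + vocab_size));
}

int main() {
    double class_prior = 100.0 / 600.0;      // hypothetical: 100 of 600 training texts belong to this class
    double score = std::log(class_prior);    // start from the log prior
    double counts[] = {3, 0, 7};             // hypothetical counts of the three test words in this class
    for (double c : counts)
        score += laplace_log_likelihood(c, 120.0, 900.0);   // 120 tokens in the class, 900-word vocabulary
    std::printf("log-score = %f\n", score);  // the class with the largest log-score is the prediction
    return 0;
}

Summing logs gives the same ranking as multiplying probabilities, so the predicted class is unchanged.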

Regression:

#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring> 
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>

using namespace std;

struct ct {                  //per-word data within one training sample
    string s;                //the word
    double num;              //its number of occurrences in the sample
    ct(string a = "", double b = 0) {
        s = a;
        num = b;
    }
};

struct train_data {
    int index;                      //index of the training sample
    int emotion_value;              //emotion label id
    string emotion;                 //emotion name
    vector<ct> word;                //words of the training text with their counts
    int onehot[1000];               //values of the one-hot matrix row
    double distance;                //distance (unused in this program)
    double sum;                     //total number of word tokens in the sample
    vector<double> fre_set;         //the six emotion-probability columns read from the CSV
    vector<double> tf;              //term frequency of every vocabulary word for this sample

    train_data(int a = 0, int b = 0, string c = "", double d = 0.0, double e = 0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        sum = e;
        word.clear();
        fre_set.clear();
        for (int i = 0; i < 1000; i ++)
            onehot[i] = 0;
    }
};



vector<string> train_text;          //each complete training text line
vector<string> all_words;           //all distinct words (the vertical axis)
vector<train_data> all_trains;      //all training samples (the horizontal axis)
int right_sum;                      //number of correct predictions

void reading_file(void );
void get_TF(void );
void regre_calculating(void );


int main() {
    train_text.clear();
    all_words.clear();
    all_trains.clear();

    reading_file();
    //cout << all_words.size() << endl; 904
    get_TF();
    regre_calculating();

    //for (int i = 0; i <= all_trains[2].onehot.size(); i ++)
    //  cout << all_trains[2].onehot[i] << endl;

    return 0;
}

void reading_file() {
    ifstream t("Dataset_train.csv");    
    char c[150];
    string temp;
    t.getline(c, 150);
    while (t.getline(c, 150)) {
        train_data new_train;
        char d[150];
        strcpy(d, c);
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");
        //cout << p << endl;
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            new_train.sum ++;
            string word = p2;
            //collect every distinct word into the vocabulary
            bool flag1 = true;
            for (int i = 0; i < all_words.size(); i ++) {
                if (all_words[i] == word) {
                    flag1 = false;
                    break;                  
                }
                else 
                    continue;
            }
            if (flag1)
                all_words.push_back(word);

            //count the occurrences of each word within this training text
            bool flag2 = true;
            for (int i = 0; i < new_train.word.size(); i ++) {
                if (new_train.word[i].s == word) {
                    new_train.word[i].num ++;
                    flag2 = false;
                    break;
                }
                else 
                    continue;
            }
            if (flag2) {
                ct new_ct(word, 1);
                new_train.word.push_back(new_ct);   
            }

            p2 = strtok(NULL, " ");
        }
        char *p3 = strtok(d, ",");
        p3 = strtok(NULL, ",");
        p3 = strtok(NULL, ",");
        stringstream ss;
        double fre;
        while (p3 != NULL) {
            temp = p3;
            ss.str(temp);
            ss >> fre;
            new_train.fre_set.push_back(fre);
            ss.clear();
            p3 = strtok(NULL, ",");
        }

        all_trains.push_back(new_train);                
    }

}


void get_TF() {                     //term frequency of every vocabulary word for every training sample (0 if absent)
    for (int i = 0; i < all_trains.size(); i ++) {
        for (int j = 0; j < all_words.size(); j ++) {
            bool flag = true;
            for (int k = 0; k < all_trains[i].word.size(); k ++) {
                if (all_words[j] == all_trains[i].word[k].s) {
                    all_trains[i].tf.push_back(all_trains[i].word[k].num/all_trains[i].sum);
                    flag = false;
                    break;
                }
            }
            if (flag)
                all_trains[i].tf.push_back(0);
        }
    }
}

void regre_calculating() {
    ifstream t("Dataset_validation.csv");
    char c[150];
    string temp;
    t.getline(c, 150);
    ofstream out("14353324_xiangketing_regression.txt");
    while (t.getline(c, 150)) {
        train_data test_train;
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");
        //cout << p << endl;
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            test_train.sum ++;
            temp = p2;
            bool flag = true;
            for (int i = 0; i < test_train.word.size(); i ++) {
                if (test_train.word[i].s == temp) {
                    test_train.word[i].num ++;
                    flag = false;
                    break;
                }
            }
            if (flag) {
                ct new_ct(temp, 1);
                test_train.word.push_back(new_ct);
            }

            p2 = strtok(NULL, " ");
        }
        map<string, double> m;
        for (int i = 0; i < test_train.word.size(); i ++) {
            m[test_train.word[i].s] = test_train.word[i].num / test_train.sum;
        }

        map<int, double> fre;
        fre[1] = 0; fre[2] = 0; fre[3] = 0;
        fre[4] = 0; fre[5] = 0; fre[6] = 0;
        for (int i = 0; i < all_trains.size(); i ++) {
            double temp_fre = 1;
            vector<double> v;
            for (int j = 0; j < test_train.word.size(); j ++) {
                bool flag = true;
                for (int k = 0; k < all_words.size(); k ++) {
                    if (test_train.word[j].s == all_words[k]) {
                        if (all_trains[i].tf[k] != 0) 
                            v.push_back(all_trains[i].tf[k]);   
                        else 
                            v.push_back(1.0 / (all_trains[i].sum + all_words.size())); 
                        flag = false;
                        break;
                    }
                }
                if (flag)
                    v.push_back((1.0 + test_train.word[j].num) / (test_train.sum + all_words.size()));
            }
            for (int k = 0; k < v.size(); k ++) {
                temp_fre *= v[k];
            }
            fre[1] += temp_fre * all_trains[i].fre_set[0];
            fre[2] += temp_fre * all_trains[i].fre_set[1];
            fre[3] += temp_fre * all_trains[i].fre_set[2];
            fre[4] += temp_fre * all_trains[i].fre_set[3];
            fre[5] += temp_fre * all_trains[i].fre_set[4];
            fre[6] += temp_fre * all_trains[i].fre_set[5];

        }
        double all_fre = fre[1] + fre[2] + fre[3] + fre[4] + fre[5] + fre[6];


        out << fre[1] / all_fre << '\t' << fre[2] / all_fre << '\t'
            << fre[3] / all_fre << '\t' << fre[4] / all_fre << '\t'
            << fre[5] / all_fre << '\t' << fre[6] / all_fre << endl;
    }
}
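
For clarity, this is what regre_calculating() above computes (a summary of the code, not a statement from the original post): each training sample i contributes its six CSV frequency columns fre_set, weighted by the product of the (smoothed) term frequencies of the test words, and the six accumulated scores are normalized to sum to 1:

\mathrm{score}(e) = \sum_{i} \Big( \prod_{w \in d_{\text{test}}} \widetilde{tf}_i(w) \Big)\, \mathrm{fre\_set}_i(e),
\qquad
\hat{p}(e) = \frac{\mathrm{score}(e)}{\sum_{e'=1}^{6} \mathrm{score}(e')}

where \widetilde{tf}_i(w) is all_trains[i].tf[w] when it is non-zero, 1 / (all_trains[i].sum + |V|) when the word is in the vocabulary but absent from sample i, and (count of w in the test text + 1) / (test text length + |V|) when w is outside the vocabulary.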