#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
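/* The data set comes in one of two layouts.
Layout 1:
category-name-1 category-name-2 ... category-name-n class-label
... ... ... class-label-value-1
class-label-value-2
...
class-label-value-m
Layout 2:
test-set-id test-set-content test-set-attribute-value
1 data1 data2 .. value1
2
...
m
Example: weibo-id weibo-content weibo-repost-count
1 今天 天气 晴 0
2 中大 新校长 10
...
*/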
#pragma region TypeName
// Short aliases used throughout the file.
using String = string;
//typedef String Label;
using VS = vector<String>;
using VI = vector<int>;
#pragma endregion
#pragma region Global Variables
// Fixed dimensions of the training file read by init_exam().
static const int gRow = 8; // 7 data rows + 1 header row
static const int gCol = 5; // 4 feature columns + 1 class-label column
VS labels;                  // column (category) names
VI labelsId;                // ids matching the entries of labels
map<int, string> id2labels; // column id -> column name
map<int, string> id2class;  // id -> class-label value found in the last column
// Design note from the author: this is not ideal; every column should have
// its own map<int, string> translating ids back to attribute values.
#pragma endregion
#pragma region ClassType
/**
 * Integer-encoded data matrix stored as a vector of rows.
 *
 * The members are public and some callers in this file write row, col and
 * dataSet directly, so the class cannot enforce that they agree; the
 * accessors below therefore bounds-check instead of trusting row/col.
 */
class DataSet{ // matrix built by vector
public:
    /** Creates an empty data set with 0 rows and 0 columns. */
    DataSet() : row(0), col(0) {}

    /** Creates a row x col data set whose cells are all 0. */
    DataSet(int row, int col) : row(0), col(0) {
        this->resize(row, col);
    }

    /**
     * Discards the current rows and rebuilds the matrix as row x col zeros.
     * i2s is resized to one map per column; maps of surviving columns are
     * kept. Negative sizes are treated as 0.
     */
    void resize(int row, int col){
        if (row < 0) row = 0;
        if (col < 0) col = 0;
        this->row = row;
        this->col = col;
        dataSet.assign(row, std::vector<int>(col));
        i2s.resize(col);
    }

    /** Row access, so callers can write dataSet[i][j]. Not bounds-checked. */
    std::vector<int>& operator[](int row){
        return dataSet[row];
    }

    /** Read-only row access for const data sets. Not bounds-checked. */
    const std::vector<int>& operator[](int row) const{
        return dataSet[row];
    }

    /**
     * Appends one row and increments row.
     * Neither col nor i2s is updated; the caller has to maintain them.
     */
    void push_back(const std::vector<int>& v) {
        this->row++;
        this->dataSet.push_back(v);
    }

    /**
     * Returns a copy of column id, one entry per row (the first `row` rows).
     * Throws std::out_of_range when a row is missing or shorter than id + 1,
     * or when id is negative, instead of reading out of bounds.
     */
    std::vector<int> getCol(int id) const {
        std::vector<int> v;
        if (this->row > 0) {
            v.reserve(this->row);
        }
        for (int i = 0; i < this->row; i++) {
            v.push_back(this->dataSet.at(i).at(id));
        }
        return v;
    }

    int row; // number of rows
    int col; // number of columns
    std::vector< std::vector<int> > dataSet; // the rows
    std::vector< std::map<int, std::string> > i2s; // per column: value id -> attribute text
} dataSet; // the single global instance used by the rest of the program
/**
 * One node of the decision tree.
 *
 * Every node stores featureValue, the branch value under which it hangs from
 * its parent. featureLabelId is the id of the feature an inner node splits on;
 * for a leaf (a node without children) the same field holds the id of the
 * predicted class label instead (the author notes this is not a good design).
 */
class Node{
public:
    /**
     * @param label feature id for an inner node, class-label id for a leaf
     * @param value branch value of this node relative to its parent
     */
    Node(int label, int value)
        : featureLabelId(label), featureValue(value) {}

    /** Appends a copy of node as the last child. */
    void addChild(const Node& node) { child.push_back(node); }

    /** A node without children is a leaf. */
    bool isLeafNode() const { return child.empty(); }

    /**
     * Returns the first child whose branch value equals value.
     * @throws std::out_of_range if no child has that branch value (previously
     *         the function ran off its end, which is undefined behaviour).
     */
    Node& getChild(int value){
        for (vector<Node>::iterator it = child.begin(); it != child.end(); ++it) {
            if (it->featureValue == value)
                return *it;
        }
        throw std::out_of_range("Node::getChild: no child for the requested feature value");
    }

    /** Read-only variant of getChild(); same lookup and same exception. */
    const Node& getChild(int value) const{
        for (vector<Node>::const_iterator it = child.begin(); it != child.end(); ++it) {
            if (it->featureValue == value)
                return *it;
        }
        throw std::out_of_range("Node::getChild: no child for the requested feature value");
    }

    /**
     * Returns the class-label text of a leaf, looked up in id2class.
     * Precondition: this node is a leaf.
     */
    string getClassLabel() const{
        return id2class[this->featureLabelId];
    }

    /** Returns the feature id (inner node) or class-label id (leaf). */
    int getFeatureLabelId() const{
        return this->featureLabelId;
    }

    //bool operator < (const Node& oNode) const{ return this->featureValue < oNode.featureValue; }

    /**
     * Prints the tree breadth-first to cout, one line per visited node:
     * the node itself followed by " -> value:"name"" for each child.
     * Leaf children are named through id2class and are not visited further;
     * inner children are named through id2labels and queued.
     */
    void print() const{
        // Pointers into the tree are queued so subtrees are not copied.
        queue<const Node*> pending;
        pending.push(this);
        while (!pending.empty()){
            const Node* current = pending.front();
            pending.pop();
            cout << current->featureValue << ":\"" << id2labels[current->featureLabelId] << "\"";
            for (vector<Node>::const_iterator it = current->child.begin();
                 it != current->child.end(); ++it) {
                if (it->isLeafNode()) {
                    cout << " -> " << it->featureValue << ":\"" << id2class[it->featureLabelId] << "\"";
                } else {
                    cout << " -> " << it->featureValue << ":\"" << id2labels[it->featureLabelId] << "\"";
                    pending.push(&*it);
                }
            }
            cout << endl;
        }
    }
private:
    int featureLabelId; // feature id, or class-label id when this is a leaf
    int featureValue;   // branch value of this node relative to its parent's feature
    vector<Node> child; // sub-trees, one per branch value
};
/**
 * Thin wrapper around std::set<T> that can be built from a vector.
 * Elements are unique and iterated in ascending order.
 */
template <class T>
class Set{
public:
    /** Creates an empty set. */
    Set() {}

    /**
     * Creates the set of distinct elements of v.
     * Takes a const reference so const vectors and temporaries are accepted.
     */
    Set(const std::vector<T>& v) : s(v.begin(), v.end()) {}

    /** Returns an iterator to item, or end() if it is not present. */
    typename std::set<T>::iterator find(const T& item) { return s.find(item); }
    typename std::set<T>::const_iterator find(const T& item) const { return s.find(item); }

    /** Inserts item; a duplicate is ignored. */
    void insert(const T& item) { s.insert(item); }

    typename std::set<T>::iterator begin() { return s.begin(); }
    typename std::set<T>::iterator end() { return s.end(); }
    typename std::set<T>::const_iterator begin() const { return s.begin(); }
    typename std::set<T>::const_iterator end() const { return s.end(); }

    /** Number of distinct elements. */
    int size() const { return static_cast<int>(s.size()); }
private:
    std::set<T> s;
};
#pragma endregion
#pragma region FunctionPrototype
// Forward declarations of the free functions defined after main().

// Formats value through a string stream.
template <typename T>
String ConvertToString(T value);

// Base-2 logarithm.
double log2(double x);

// Prints the elements of values on one line.
template <typename T>
void printVector(vector<T> values);

// Prints the data set row by row.
void printDataSet(DataSet dataSet);

// Shannon entropy of the class labels (last column).
double calcShannonEnt(DataSet dataSet);

// Rows whose column `axis` equals `value`, with that column removed.
DataSet splitDataSet(DataSet dataSet, int axis, int value);

// Column index of the feature with the best gain ratio, or -1.
int chooseBestFeatureToSplit(DataSet dataSet);

// Most frequent value in classList.
int majorityCnt(VI classList);

// Builds the decision (sub-)tree for dataSet.
Node createTree(DataSet dataSet, VI labelsId, int cvalue);
#pragma endregion
/**
 * Fills the global data set with a small built-in example (5 rows, 3 columns)
 * and prints it. The last column is the class label, encoded as 1 = "yes" and
 * 0 = "no". Also registers the column names and the class-label texts in the
 * global lookup tables.
 */
void init_default() { // built-in sample data
    static const int kRows = 5;
    static const int kCols = 3;
    static const int kSamples[kRows][kCols] = { { 1, 1, 1 },
                                                { 1, 1, 1 },
                                                { 1, 0, 0 },
                                                { 0, 1, 0 },
                                                { 0, 1, 0 } };
    static const char* const kColumnNames[kCols] = { "No surfacing", "Flippers", "Fish" };

    dataSet.row = kRows;
    dataSet.col = kCols;
    for (int r = 0; r < kRows; ++r) {
        // Rows are appended straight to the storage; row/col were set above.
        dataSet.dataSet.push_back(VI(kSamples[r], kSamples[r] + kCols));
    }
    printDataSet(dataSet);
    cout << endl;

    for (int c = 0; c < kCols; ++c) {
        labels.push_back(kColumnNames[c]);
        labelsId.push_back(c);
        id2labels[c] = kColumnNames[c];
    }
    id2class[1] = "yes";
    id2class[0] = "no";
}
/**
 * Loads the training set from the text file "Dataset_train" into the global
 * dataSet and fills labels, labelsId, id2labels and id2class.
 *
 * Expected layout: one header line with gCol whitespace-separated column
 * names (the last one is the class label), followed by up to gRow-1 data
 * lines with gCol whitespace-separated values each. Every distinct text value
 * of a column is mapped to a small integer id in order of first appearance;
 * the reverse mapping is stored in dataSet.i2s[column], and for the last
 * column also in id2class.
 *
 * At most gRow-1 data lines are read; extra lines are ignored and missing
 * lines are treated as empty.
 *
 * @throws std::runtime_error if the file cannot be opened.
 */
void init_exam(){
    ifstream file("Dataset_train");
    if (!file) {
        // Previously an unopened file made the read loop spin forever while
        // writing past the end of the line buffer.
        throw std::runtime_error("init_exam: cannot open training file \"Dataset_train\"");
    }
    dataSet.col = gCol;
    const int dataRows = gRow - 1; // gRow also counts the header line

    { // header line: column names, the last one being the class label
        string featureTag;
        getline(file, featureTag);
        stringstream ss(featureTag);
        string label;
        for (int j = 0; j < gCol; j++) {
            ss >> label;
            labels.push_back(label);
            id2labels[j] = label;
            labelsId.push_back(j);
        }
    }

    // Raw data lines. The loop is bounded by the buffer size and stops on the
    // first failed read, so a trailing newline or surplus lines cannot write
    // past str[dataRows - 1].
    String str[gRow-1];
    for (int row = 0; row < dataRows && getline(file, str[row]); row++) {
    }

    // Split every line into its gCol text fields.
    string str_matrix[gRow-1][gCol];
    for (int i = 0; i < dataRows; i++) {
        stringstream ss(str[i]);
        for (int j = 0; j < gCol; j++) {
            ss >> str_matrix[i][j];
        }
    }

    dataSet.resize(dataRows, gCol);
    // Encode column by column: each new text value gets the next free id.
    for (int j = 0; j < gCol; j++) {
        map<string, int> s2i; // text value -> id, for this column only
        int featureId = 0;    // next unused id in this column
        for (int i = 0; i < dataRows; i++) {
            const string& cell = str_matrix[i][j];
            map<string, int>::iterator it = s2i.find(cell);
            if (it != s2i.end()) { // value already has an id
                dataSet[i][j] = it->second;
                continue;
            }
            s2i.insert(pair<string, int>(cell, featureId));
            dataSet.i2s[j][featureId] = cell;
            if (j == gCol-1) { // last column holds the class labels
                id2class[featureId] = cell;
            }
            dataSet[i][j] = featureId++;
        }
    }
    // The stream is closed by its destructor.
}
// The test set below is hard-coded and therefore not general.
int test_row = 7;   // number of test samples
int test_col = 4;   // number of feature columns per sample
int test_set[7][4]; // encoded test samples, filled by loadTestSet()

/**
 * Copies the built-in, already integer-encoded test samples into test_set.
 * Reading them from a file ("Dataset_test") was planned but is not done.
 */
void loadTestSet(){
    static const int kEncodedSamples[7][4] = { {0, 1, 0, 0},
                                               {0, 2, 1, 0},
                                               {2, 1, 1, 0},
                                               {0, 1, 1, 1},
                                               {1, 1, 0, 1},
                                               {1, 0, 1, 0},
                                               {2, 1, 0, 1} };
    int r = 0;
    while (r < test_row) {
        int c = 0;
        while (c < test_col) {
            test_set[r][c] = kEncodedSamples[r][c];
            ++c;
        }
        ++r;
    }
}
void classify(const Node& root){
for(int i = 0; i < test_row; i++){
Node node = root;
while(!node.isLeafNode()){
// featureLabelId is col id
node = node.getChild(test_set[i][node.getFeatureLabelId()]);
} cout << node.getClassLabel() << endl;
}
}
int main(){
init_exam();
// cout << calcShannonEnt(dataSet) << endl; // test1
// printDataSet(splitDataSet(dataSet, 0, 1)); cout<< endl;
// printDataSet(splitDataSet(dataSet, 0, 0));//test2
// cout << chooseBestFeatureToSplit(dataSet) << endl;// test3
// createTree(dataSet, labelsId, -1).print(); // test4 -1是根节点
// start process
Node root = createTree(dataSet, labelsId, -1);
loadTestSet();
classify(root);
// end process
return 0;
}
/**
 * Prints the data set to cout: one line per row, every cell followed by a
 * single space.
 */
void printDataSet(DataSet dataSet){
    int r = 0;
    while (r < dataSet.row) {
        int c = 0;
        while (c < dataSet.col) {
            cout << dataSet.dataSet[r][c] << ' ';
            ++c;
        }
        cout << endl;
        ++r;
    }
}
/** Base-2 logarithm of x, computed as ln(x) / ln(2). */
double log2(double x){
    const double lnTwo = log(2.0);
    return log(x) / lnTwo;
}
/**
 * Prints the elements of v to cout on one line, each followed by a single
 * space, and ends the line.
 */
template <typename T>
void printVector(vector<T> v){
    for (typename vector<T>::const_iterator item = v.begin(); item != v.end(); ++item) {
        cout << *item << ' ';
    }
    cout << endl;
}
/** Returns the text that streaming value with operator<< produces. */
template <typename T>
String ConvertToString(T value) {
    ostringstream formatted;
    formatted << value;
    return formatted.str();
}
/**
 * Shannon entropy (in bits) of the class labels of dataSet, i.e. of the
 * values in its last column. Returns 0.0 for a data set without rows.
 */
double calcShannonEnt(DataSet dataSet){
    const int sampleCount = dataSet.row;

    // Count how often each class label occurs.
    map<int, int> tally;
    for (int r = 0; r < dataSet.row; ++r) {
        const int label = dataSet.dataSet[r][dataSet.col - 1];
        tally[label]++;
    }

    // entropy = - sum over labels of p * log2(p)
    double entropy = 0.0;
    map<int, int>::iterator entry = tally.begin();
    while (entry != tally.end()) {
        const double p = entry->second / (double)sampleCount;
        entropy -= p * log2(p);
        ++entry;
    }
    return entropy;
}
/**
 * Returns the rows of dataSet whose column `axis` equals `value`, with that
 * column removed. The result has col = dataSet.col - 1 when at least one row
 * matched; otherwise it stays an empty 0 x 0 data set.
 */
DataSet splitDataSet(DataSet dataSet, int axis, int value){
    DataSet subset;                       // starts with 0 rows and 0 columns
    const int keptCols = dataSet.col - 1; // the axis column is dropped

    for (int r = 0; r < dataSet.row; ++r) {
        const VI& source = dataSet.dataSet[r];
        if (source[axis] != value) {
            continue;
        }
        // Reduced feature vector: everything before axis, then everything after it.
        VI reduced(source.begin(), source.begin() + axis);
        reduced.insert(reduced.end(),
                       source.begin() + axis + 1,
                       source.begin() + dataSet.col);
        subset.push_back(reduced);
    }

    if (subset.row > 0) { // only a non-empty result gets a column count
        subset.col = keptCols;
    }
    return subset;
}
/**
 * Picks the feature column with the highest information gain ratio (C4.5).
 *
 * For every feature column the data set is split by each distinct value; the
 * gain is the base entropy minus the weighted entropy of the parts, and the
 * gain ratio is gain / SplitInfo. The last column is the class label and is
 * not a candidate.
 *
 * Prints "SplitInfo: ..." and "infoGainRatio ..." for every feature to cout
 * (diagnostic output kept from the original).
 *
 * @return index of the best feature column, or -1 when no feature has a gain
 *         ratio greater than 0 (callers must handle -1).
 */
int chooseBestFeatureToSplit(DataSet dataSet){
    int numFeatures = dataSet.col - 1; // the last col is the value to predict
    double baseEntropy = calcShannonEnt(dataSet);
    double bestInfoGainRatio = 0.0;
    int bestFeature = -1;
    for (int i = 0; i < numFeatures; i++) {
        VI featList = dataSet.getCol(i);
        Set<int> uniqueVals = Set<int>(featList);
        double newEntropy = 0.0;
        double SplitInfo = 0.0; // used by C4.5
        for (set<int>::iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
            DataSet subDataSet = splitDataSet(dataSet, i, *value);
            double prob = subDataSet.row / (double)(dataSet.row);
            newEntropy += prob*calcShannonEnt(subDataSet);
            SplitInfo -= prob*log2(prob);
        }
        double infoGain = baseEntropy - newEntropy;
        cout << "SplitInfo: " << SplitInfo << endl;
        // A feature with a single distinct value has SplitInfo == 0; dividing
        // would give 0/0. Such a feature carries no information, so its ratio
        // is reported as 0 and it is never selected.
        double infoGainRatio = (SplitInfo > 0.0) ? infoGain / SplitInfo : 0.0;
        // ---- diagnostic output ----
        cout << "infoGainRatio " << infoGainRatio << endl;
        // ---- diagnostic output ----
        if (infoGainRatio > bestInfoGainRatio) {
            bestInfoGainRatio = infoGainRatio;
            bestFeature = i;
        }
    }
    return bestFeature;
}
/**
 * Returns the value that occurs most often in classList (the class-label
 * column). On a tie, the value that reaches the highest count first while
 * scanning from the front wins.
 *
 * @return the majority value, or -1 for an empty list (previously an
 *         uninitialised value was returned in that case).
 */
int majorityCnt(std::vector<int> classList) {
    std::map<int, int> classCount;
    int majorityLabel = -1;
    int maxClassCount = 0;
    for (std::vector<int>::const_iterator it = classList.begin(); it != classList.end(); ++it) {
        const int count = ++classCount[*it];
        if (maxClassCount < count) {
            maxClassCount = count;
            majorityLabel = *it;
        }
    }
    return majorityLabel;
}
/**
 * Recursively builds the decision tree for dataSet.
 *
 * @param dataSet  integer-encoded samples; the last column is the class label
 * @param labelsId original feature id of every column that is still present
 * @param cvalue   branch value of the node being built, relative to its
 *                 parent's feature (-1 is passed for the root)
 * @return an inner node that splits on the best feature, or a leaf whose
 *         label id is the predicted class
 *
 * Leaves are produced when
 *  - the data set has no rows (label id -1, meaning "no class known"),
 *  - all rows share one class,
 *  - only the class column is left (majority class), or
 *  - no feature yields a positive gain ratio (majority class). In that case
 *    chooseBestFeatureToSplit returns -1, which used to be used as a vector
 *    index and erase position.
 */
Node createTree(DataSet dataSet, VI labelsId, int cvalue){
    VI classList = dataSet.getCol(dataSet.col - 1); // the class-label column
    if (classList.empty()) { // nothing to learn from
        return Node(-1, cvalue);
    }
    Set<int> uniqueVals(classList);
    if (uniqueVals.size() == 1) { // every row has the same class
        return Node(classList[0], cvalue);
    }
    if (dataSet.col == 1) { // no features left to split on: majority class
        return Node(majorityCnt(classList), cvalue);
    }
    int bestFeat = chooseBestFeatureToSplit(dataSet); // column index of the best feature
    if (bestFeat < 0 || bestFeat >= static_cast<int>(labelsId.size())) {
        // No informative feature (or no id known for it): majority class.
        return Node(majorityCnt(classList), cvalue);
    }
    int bestFeatLabel = labelsId[bestFeat];
    Node myTree(bestFeatLabel, cvalue); // this node splits on bestFeatLabel
    // The chosen column disappears from every sub data set, so drop its id too.
    labelsId.erase(labelsId.begin() + bestFeat);
    VI featValues = dataSet.getCol(bestFeat);
    uniqueVals = Set<int>(featValues);
    for (set<int>::iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
        // One branch per distinct value of the chosen feature.
        myTree.addChild(createTree(splitDataSet(dataSet, bestFeat, *value),
                                   labelsId, *value));
    }
    return myTree;
}
// C4.5
// Latest recommended article published 2021-05-23 22:27:23