文章目录
tree
src(放置源代码和样本)
main.cpp:主框架
#include<iostream>
#include<vector>
#include<map>
#include<omp.h>
#include "TreeFunction.h"
#include "Data.h"
using namespace std;
// Globals defined in Data.cpp; declared extern here so main() can use them.
extern vector<vector<string> >samples;// training set (one row per sample)
extern vector<vector<string> >test_samples; // test set (one row per sample)
extern vector<string>features;// all attribute names; the label column is last
extern vector<string>remain_features;// all attributes except the last (the label)
extern map<string,vector<string> >data;// attribute name -> that column's value for every training sample
extern map<string,vector<string> >subdata;// attribute name -> its distinct values
int main()
{
    // Pipeline: load training data, load test data, grow the decision tree,
    // then classify every test sample and print the accuracy.
    data_Input();
    test_bench();
    tree MyTree(features,remain_features,data,subdata);
    MyTree.create_node();
    double sum=0;
    // reduction(+:sum) fixes the data race of the original code, where every
    // thread incremented the shared counter unsynchronized.
    // NOTE(review): tree::find() uses map::operator[] internally, which can
    // mutate shared state — confirm it is read-only before trusting this
    // parallel loop.
    #pragma omp parallel for schedule(dynamic) reduction(+:sum)
    for(int i=0;i<(int)test_samples.size();i++)
    {
        // Copy the whole row (including the trailing label); find() only
        // reads the attribute values it recognizes.
        vector<string> arr(test_samples[i].begin(),test_samples[i].end());
        int n=test_samples[i].size();
        if(MyTree.find(arr)==test_samples[i][n-1])
            sum++;
    }
    // Guard: avoid printing NaN when the test set failed to load.
    if(!test_samples.empty())
        cout<<"正确率:"<<1.0*sum/test_samples.size()*100<<"%"<<endl;
    return 0;
}
Data.cpp:引入数据
①把训练数据集practice_data.txt引入名为samples的二维数组,再从samples中提取出4个派生数据结构(2个vector和2个map)
②把测试数据集test_data.txt引入名为test_samples的二维数组
#include<fstream>
#include<iostream>
#include<sstream>
#include<map>
#include<vector>
#include<set>
#include"Data.h"
using namespace std;
// Definitions of the globals shared with main.cpp (declared extern there).
vector<vector<string> >samples;// training set (one row per sample)
vector<vector<string> >test_samples; // test set (one row per sample)
vector<string>features;// all attribute names; the label column is last
vector<string>remain_features;// all attributes except the last (the label)
map<string,vector<string> >data;// attribute name -> that column's value for every training sample
map<string,vector<string> >subdata;// attribute name -> its distinct values
void data_Input()
{
ifstream inFile("训练集的绝对路径/tree/src/practice_data.txt",ios::in);
string linestr;
if(!inFile)
{
cout<<"文件读取失败!"<<endl;
return;
}
int k=1;
while(getline(inFile,linestr))
{
std::stringstream ss(linestr);
string str("");
vector<string>linearray;
while(getline(ss,str,',')){
if(k){
features.push_back(str);
continue;
}
linearray.push_back(str);
}
if(k){
k=0;
continue;
}
samples.push_back(linearray);
samples.push_back(linearray);
linearray.clear();//把指针指向第一个
}
inFile.close();
remain_features=features;
remain_features.pop_back();
for(int i=0;i<features.size();i++)
{
set<string>feature_num;
for(int j=0;j<samples.size();j++)
{
data[features[i]].push_back(samples[j][i]);
feature_num.insert(samples[j][i]);
}
for(set<string>::iterator it=feature_num.begin(); it!=feature_num.end();it++)
{
subdata[features[i]].push_back(*it);
}
}
}
void test_bench()
{
ifstream inFile("/home/linzy/tree/src/test_data.txt",ios::in);
string linestr;
if(!inFile)
{
cout<<"测试集文件读取失败!"<<endl;
return;
}
while(getline(inFile,linestr))
{
std::stringstream ss(linestr);
string str("");
vector<string>linearray;
while(getline(ss,str,',')){
linearray.push_back(str);
}
test_samples.push_back(linearray);
linearray.clear();//把指针指向第一个
}
inFile.close();
}
practice_data.txt:(800组)
sunny,mild,normal,True,YES
fog,cool,low,True,YES
rainy,cool,high,True,YES
storm,mild,normal,True,NO
...
test_data.txt(200组)
storm,hot,normal,False,YES
fog,hot,normal,True,NO
fog,hot,normal,True,YES
...
由于找不到合适的数据集,两组离散型数据集均为python随机产生,因此结果可能产生较大误差
TreeFunction.cpp:决策树功能实现
#include<cmath>
#include<map>
#include<vector>
#include<set>
#include<string>
#include<omp.h>
#include "TreeFunction.h"
using namespace std;
// Constructor: snapshot the feature list, the still-splittable features,
// the sample table and the distinct-value table into this node.
tree::tree(vector<string>features, vector<string>remain_features, map<string,vector<string> >data, map<string,vector<string> >subdata)
    : feature(features),
      remain_feature(remain_features),
      attribute_table(data),
      attribute_type(subdata),
      feature_num(features.size())
{
}
//------------------------------------------------------------------------------
void tree::create_node()
{
//situation1:所有类别都相同
map<string,int>cate;
for(int i=0;i<attribute_table[feature[0]].size();i++)
{
vector<string> tmp=attribute_table[feature[feature_num-1]];
cate[tmp[i]]++;
}
if(cate.size()==1)
{
map<string,int>::iterator it=cate.begin();
attribute_node=it->first;
return;
}
//situation2:所有属性的所有属性值都相同或没有属性,找最多的yes或no
int issame=0;
for(int i=0;i<remain_feature.size();i++)
{
issame=1;
for(int j=0;j<attribute_table[remain_feature[i]].size();j++)
{
if(attribute_table[remain_feature[i]][j]!=attribute_table[remain_feature[i]][0])
{
issame=0;
break;
}
}
if(issame==0)break;
}
if(issame||remain_feature.empty())
{
attribute_node=majority(cate);
return;
}
#pragma omp parallel for schedule(dynamic)
//situation3:
attribute_node=bestattribute();
for(int i=0;i<attribute_type[attribute_node].size();i++)
{
string value=attribute_type[attribute_node][i];
map<string,vector<string> >subattribute_table;
for(int j=0;j<attribute_table[attribute_node].size();j++)
{
if(value==attribute_table[attribute_node][j])
{
for(int k=0;k<feature.size();k++)
{
subattribute_table[feature[k]].push_back(attribute_table[feature[k]][j]);
}
}
}
if(subattribute_table.empty())//可能存在错误!!!
{
tree *p=new tree(feature,remain_feature,subattribute_table,attribute_type);
string childNode_Attribute=majority(cate);
p->set_attribute(childNode_Attribute);
childnode[value]=p;
}
else
{
vector<string>child_remain_feature=remain_feature;
vector<string>::iterator it=child_remain_feature.begin();
for(;it!=child_remain_feature.end();it++)
{
if(*it==attribute_node)
{
break;
}
}
child_remain_feature.erase(it);
tree *p=new tree(feature,child_remain_feature,subattribute_table,attribute_type);
childnode[value] = p;
p->create_node();
}
}
}
//-------------------------------------------------------------------------------------
// Overwrite this node's stored attribute/label (used to force leaf labels).
void tree::set_attribute(string attribute)
{
    attribute_node=attribute;
}
//--------------------------------------------------------------------------------------
// Return the key with the highest count; ties keep the earliest key in
// map order. Precondition (unchanged from the original): cate is non-empty.
string tree::majority(map<string,int>cate)
{
    map<string,int>::iterator it=cate.begin();
    string winner=it->first;
    int best=it->second;
    while(++it!=cate.end())
    {
        if(it->second>best)
        {
            best=it->second;
            winner=it->first;
        }
    }
    return winner;
}
//---------------------------------------------------------------------------
// Pick the remaining attribute with the largest information gain
// (base entropy minus conditional entropy).
string tree::bestattribute()
{
    double base_entropy_value=base_entropy();
    // Fixed: the original returned "" when no attribute had strictly
    // positive gain, and callers then indexed maps with an empty key.
    // Fall back to the first remaining attribute instead.
    string best=remain_feature.empty()?string():remain_feature[0];
    double best_gain=0.0;
    for(int i=0;i<(int)remain_feature.size();i++)
    {
        double gain=base_entropy_value-entropy(remain_feature[i]);
        if(gain>best_gain)
        {
            best=remain_feature[i];
            best_gain=gain;
        }
    }
    return best;
}
//--------------------------------------------------------------------------
double tree::base_entropy()
{
map<string,int>tf;
for(int i=0;i<attribute_table[feature[feature_num-1]].size();i++)
{
tf[attribute_table[feature[feature_num-1]][i]]++;
}
double base_entropy_value=0.0;
for(map<string,int>::iterator it=tf.begin();it!=tf.end();it++)
{
double p=1.0*(it->second)/attribute_table[feature[feature_num-1]].size();
base_entropy_value-=p*log2(p);
}
return base_entropy_value;
}
//--------------------------------------------------------------------------
// Weighted conditional entropy H(label | attribute) over this node's table.
// Removed the original three OpenMP pragmas: two immediately preceded
// declarations instead of for loops (ill-formed OpenMP — it fails to
// compile with -fopenmp), and the first one raced on the shared map.
double tree::entropy(string attribute)
{
    vector<string>& column=attribute_table[attribute];
    vector<string>& labels=attribute_table[feature[feature_num-1]];
    vector<string>& values=attribute_type[attribute];
    // attribute value -> labels of the samples carrying that value
    // (e.g. sunny -> {yes,no,no,...}, rainy -> {yes,no,...})
    map<string,vector<string> >grouped;
    for(int i=0;i<(int)column.size();i++)
    {
        for(int j=0;j<(int)values.size();j++)
        {
            if(column[i]==values[j])
            {
                grouped[values[j]].push_back(labels[i]);
                break;
            }
        }
    }
    double entropy_result=0.0;
    for(int i=0;i<(int)values.size();i++)
    {
        vector<string>& group=grouped[values[i]];
        // Guard: a value listed in attribute_type but absent from this
        // sub-table would otherwise produce 0/0 (NaN) below.
        if(group.empty())continue;
        // Count each label inside this group, then accumulate its entropy.
        map<string,int>counts;
        for(int j=0;j<(int)group.size();j++)
            counts[group[j]]++;
        double group_entropy=0.0;
        for(map<string,int>::iterator it=counts.begin();it!=counts.end();++it)
        {
            double p=1.0*it->second/group.size();
            if(p>0)group_entropy-=p*log2(p);
        }
        // Weight by the fraction of samples in this group.
        entropy_result+=1.0*group.size()/column.size()*group_entropy;
    }
    return entropy_result;
}
//----------------------------------------------------------------------------------------
// Predict: walk down the tree using the sample's attribute values; the
// attribute stored at the reached leaf is the predicted label.
string tree::find(vector<string>attributes)
{
    // Leaf: no children -> attribute_node holds the label.
    if(childnode.empty())
        return attribute_node;
    // Find which of the split attribute's known values this sample carries.
    // (Removed the original "#pragma omp parallel for": breaking out of an
    // OpenMP for loop is ill-formed, and the shared attribute_value raced.)
    string attribute_value;
    vector<string>& types=attribute_type[attribute_node];
    for(int i=0;i<(int)types.size() && attribute_value.empty();i++)
    {
        for(int j=0;j<(int)attributes.size();j++)
        {
            if(attributes[j]==types[i])
            {
                attribute_value=attributes[j];
                break;
            }
        }
    }
    // Fixed: the original used childnode[attribute_value], which inserts a
    // NULL child for unseen values (then dereferences it -> crash) and
    // mutates the tree — a data race when find() runs in a parallel loop.
    map<string,tree*>::iterator it=childnode.find(attribute_value);
    if(it==childnode.end()||it->second==NULL)
    {
        // NOTE(review): internal nodes store the split attribute name, so
        // this fallback returns that name, not a label — consider storing a
        // majority label for this case instead.
        return attribute_node;
    }
    return it->second->find(attributes);
}
CMakeLists.txt
set(SRC_LIST ./Data.cpp ./main.cpp ./TreeFunction.cpp)
include_directories (../include)
add_executable (main ${SRC_LIST})
set (EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin)
# OpenMP needs both compile flags (-fopenmp) and link flags; the imported
# OpenMP::OpenMP_CXX target carries both. The original linked ${OpenMP_LIBS},
# a variable FindOpenMP never defines, so the pragmas were silently ignored.
find_package(OpenMP REQUIRED)
if(OpenMP_CXX_FOUND)
    message("OPENMP FOUND")
    target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
endif()
include(放置头目录)
data.h
#ifndef _Data_H_
#define _Data_H_
void data_Input();// load the training set and build the global feature tables
void test_bench();// load the test set into the global test_samples
#endif
TreeFunction.h
#ifndef _TreeFunction_H
#define _TreeFunction_H
// NOTE(review): `using namespace std;` in a header leaks into every file
// that includes it — prefer qualifying std:: names. Left unchanged here.
using namespace std;
// One node of an ID3-style decision tree; children live in `childnode`.
class tree
{
public:
tree(vector<string>feature, vector<string>remain_feature, map<string,vector<string> >data, map<string,vector<string> >subdata);// constructor: copies the sample table into this node
void create_node(); // recursively grow the subtree rooted at this node
double entropy(string); // conditional entropy of the labels given one attribute
string bestattribute(); // attribute with the largest information gain
string majority(map<string,int>cate);// most frequent key (majority label)
double base_entropy();// entropy of the label column itself
void set_attribute(string);// force this node's attribute/label (used for leaves)
string find(vector<string>arributes);// predict the label for one sample's values
private:
vector<string>feature;// all attribute names; the label column is last
vector<string>remain_feature;// attributes still available for splitting
map<string,vector<string> >attribute_table;// attribute name -> column of values for this node's samples
map<string,vector<string> >attribute_type;// attribute name -> its distinct values
int feature_num;// number of attributes
string attribute_node;// split attribute (internal node) or label (leaf)
map<string,tree*>childnode;// attribute value -> child subtree (raw owning pointers; no destructor visible — TODO confirm cleanup)
};
#endif
CMakeLists.txt
# cmake_minimum_required must precede project(); 3.9+ also provides the
# OpenMP::OpenMP_CXX imported target used by src/CMakeLists.txt.
cmake_minimum_required(VERSION 3.9)
project(tree)
add_subdirectory(src)
SHELL.sh(自动化编译)
#!/bin/bash
# Configure, build, and run; abort on the first failing step instead of
# blindly running the next command (the original ran make even if cmake
# failed, and failed outright when build/ did not exist yet).
set -e
mkdir -p build bin
cd build
cmake ..
make
cd ../bin
./main
build(放置Cmake生成的对象文件)
bin(放置输出的elf文件)
最后,切换到tree目录下,执行命令 chmod 755 SHELL.sh
./SHELL.sh
结果为