写了个不怎么样的ID3,先放在这,有空回来修改(11.19):
(这个版本的data都是string类型的,没有做str2int的操作,感觉不太专业..)
#include <iostream>
#include <cmath>
#include <vector>
#include <set>
#include <map>
#include <string>
#include <sstream>
#include <algorithm>
#include <queue>
using namespace std;
#pragma region Data
// Short aliases used throughout this first, string-based version.
typedef string String;
// Text attached to a tree node: a feature name or a class name.
typedef String Label;
// A sequence of string cells (one row or one column of the table).
typedef vector<String> VS;
#pragma endregion Data
// Table of string cells stored row by row. The `row`/`col` counters are kept
// by hand and are public, so callers may read and overwrite them directly.
class DataSet{
public:
    // Builds an empty table: zero rows, zero columns.
    DataSet() : row(0), col(0) {}

    // Records the requested dimensions and appends `row` rows. Each of those
    // rows starts out empty; no cells are created for the columns.
    DataSet(int row, int col) : row(row), col(col) {
        int remaining = row;
        while (remaining-- > 0) {
            dataSet.push_back(VS());
        }
    }

    // Appends one row and bumps the row counter (`col` is left untouched).
    void push_back(VS vs) {
        ++row;
        dataSet.push_back(vs);
    }

    // Returns a copy of column `id`, taken from the first `row` rows.
    // No bounds checking is performed on `id`.
    VS getCol(int id) {
        VS column;
        int r = 0;
        while (r < row) {
            column.push_back(dataSet[r][id]);
            ++r;
        }
        return column;
    }

    int row;            // number of rows considered valid
    int col;            // number of columns considered valid
    vector<VS> dataSet; // the cells, one VS per row
};
class Node{
public:
Node(const Label& label, String value){
this->featureLabel = label;
this->value = value;
}
void push_back(const Node& node) {
child.push_back(node);
}
typename vector<Node> :: iterator begin(){
return child.begin();
}
typename vector<Node> :: iterator end(){
return child.end();
}
bool isLeafNode()const {
return child.empty();
}
void print(){
queue<Node> q;
q.push(*this);
while(!q.empty()){
Node tmpNode = q.front();//
q.pop();
if(tmpNode.isLeafNode())
continue;
cout << tmpNode.value << ":\"" << tmpNode.featureLabel <<"\"";
for ( vector<Node> :: iterator it = tmpNode.begin(); it != tmpNode.end(); it++) {
cout << " -> " << (*it).value << ":\""<< (*it).featureLabel << "\"";
q.push(*it);
} cout << endl;
}
}
private:
Label featureLabel;
String value;
vector<Node> child;
//class path{ };
};
// Thin wrapper around std::set that can be built directly from a vector,
// keeping one copy of each distinct element in sorted order.
template <class T>
class Set{
public:
    Set(){}

    // Collects the distinct elements of `v`. Taking a const reference lets
    // callers pass const vectors and temporaries as well as plain lvalues.
    Set(const std::vector<T>& v) : s(v.begin(), v.end()) {}

    // Iterator to `item`, or end() when it is absent.
    typename std::set<T> :: iterator find(const T& item) {
        return s.find(item);
    }

    // Adds `item`; a value that is already present is ignored.
    void insert(const T& item) {
        s.insert(item);
    }

    typename std::set<T> :: iterator begin(){
        return s.begin();
    }
    typename std::set<T> :: iterator end(){
        return s.end();
    }

    // Number of distinct elements.
    int size() const{
        return static_cast<int>(s.size());
    }
private:
    std::set<T> s;
};
// Forward declarations; the definitions follow main() further down.
// Formats a streamable value as text.
template <typename T>
String ConvertToString(T);
// Base-2 logarithm.
double log2(double);
// Writes the table to standard output, one row per line.
void printDataSet (DataSet);
// Shannon entropy of the class column (the last column).
double calcShannonEnt(DataSet);
// Rows whose column `axis` equals `value`, with that column removed.
DataSet splitDataSet(DataSet dataSet, int axis, String value);
// Index of the feature column chosen for the next split, or -1.
int chooseBestFeatureToSplit(DataSet d);
// Returns the most frequent entry of `classList`. On a tie the entry that
// reached the winning count first is kept; an empty list yields an empty
// label.
Label majorityCnt(VS classList) {
    map<String, int> tally;
    Label winner;
    int best = 0;
    for (VS::size_type k = 0; k < classList.size(); ++k) {
        const String& candidate = classList[k];
        int& seen = tally[candidate];
        ++seen;
        if (seen > best) {
            best = seen;
            winner = candidate;
        }
    }
    return winner;
}
// Recursively builds the ID3 decision tree for `dataSet`.
//   dataSet - table whose last column holds the class to predict
//   labels  - feature names, one per feature column of `dataSet`
//   cvalue  - value of the parent's feature that leads to this subtree
// Returns the root node of the (sub)tree.
Node createTree(DataSet dataSet, VS labels, String cvalue){
    VS classList = dataSet.getCol(dataSet.col-1);
    Set<String> uniqueVals(classList);
    if (uniqueVals.size() == 1) {   // every row has the same class: leaf
        return Node(classList[0], cvalue);
    }
    if (dataSet.col == 1) {         // no feature columns left: majority vote
        return Node(majorityCnt(classList), cvalue);
    }
    const int bestFeat = chooseBestFeatureToSplit(dataSet);
    // chooseBestFeatureToSplit returns -1 when no feature improves on the
    // current entropy (for example identical rows with different classes).
    // Indexing `labels` with it would be undefined behaviour, so stop here
    // with a majority-vote leaf instead.
    if (bestFeat < 0 || bestFeat >= static_cast<int>(labels.size())) {
        return Node(majorityCnt(classList), cvalue);
    }
    Node myTree(labels[bestFeat], cvalue); // this node tests the chosen feature
    labels.erase(labels.begin()+bestFeat); // the children no longer see that column
    VS featValues = dataSet.getCol(bestFeat);
    Set<String> branchValues(featValues);
    // One child per distinct value of the chosen feature.
    for( set<String> :: iterator value = branchValues.begin(); value != branchValues.end(); value++) {
        myTree.push_back( createTree( splitDataSet(dataSet, bestFeat, *value),
                                      labels, *value) );
    }
    return myTree;
}
// Table dimensions; not referenced by the code of this first program as
// shown here (init() hard-codes a 5x3 sample instead).
const static int gRow = 39644, gCol = 116;
// Feature names, filled in by init().
VS labels;
// The training table, filled in by init().
DataSet dataSet;
// Populates `dataSet` and `labels`; defined after main().
void init();
// Writes every element of `v` followed by a single space, then ends the line
// (and flushes) with endl.
template <typename T>
void MyPrint(vector<T>& v) {
    typename vector<T>::size_type pos = 0;
    while (pos < v.size()) {
        cout << v[pos] << ' ';
        ++pos;
    }
    cout << endl;
}
int main(){
init();
cout << calcShannonEnt(dataSet) << endl; // test1
printDataSet(splitDataSet(dataSet, 0, "1")); cout<< endl;
printDataSet(splitDataSet(dataSet, 0, "0"));//test2
cout << chooseBestFeatureToSplit(dataSet) << endl;// test3
cout << "\ntest4:\n";
createTree(dataSet, labels, "Root Label").print(); // test4
return 0;
}
void init() {
dataSet.row = 5;
dataSet.col = 3;
string str1[] = {"1", "1", "yes"};
string str2[] = {"1", "1", "yes"};
string str3[] = {"1", "0", "no"};
string str4[] = {"0", "1", "no"};
String str5[] = {"0", "1", "no"};
String str[5][3] = {str1, str2, str3, str4, str5};
// String str[5][3] = {{"1", "1", "yes"},
// {"1", "1", "yes"},
// {"1", "0", "no"},
// {"0", "1", "no"},
// {"0", "1", "no"}};
for(int i = 0; i < 5; i++) {
VS vs;
for (int j = 0; j < 3; j++) {
vs.push_back(str[i][j]);
}
dataSet.dataSet.push_back(vs);
} //printData(dataSet); cout << endl;
labels.push_back("No surfacing");
labels.push_back("Flippers");
labels.push_back("Fish");
}
// Dumps the table to standard output: cells separated (and followed) by a
// space, one row per line.
void printDataSet (DataSet dataSet){
    int r = 0;
    while (r < dataSet.row) {
        int c = 0;
        while (c < dataSet.col) {
            cout << dataSet.dataSet[r][c] << ' ';
            ++c;
        }
        cout << endl;
        ++r;
    }
}
// Base-2 logarithm computed by change of base from the natural logarithm.
double log2(double x){
    const double naturalLogOfX = log(x);
    const double naturalLogOfTwo = log(2.0);
    return naturalLogOfX / naturalLogOfTwo;
}
// Renders `value` through its stream insertion operator and returns the text.
template <typename T>
String ConvertToString(T value) {
    ostringstream out;
    out << value;
    return out.str();
}
// Shannon entropy (in bits) of the class column, i.e. the last column of
// `dataSet`. A table without rows gives 0.
double calcShannonEnt(DataSet dataSet){
    // Count how often each class occurs.
    map<String, int> histogram;
    const int classCol = dataSet.col - 1;
    int r = 0;
    while (r < dataSet.row) {
        ++histogram[dataSet.dataSet[r][classCol]];
        ++r;
    }

    // Sum -p * log2(p) over the observed classes.
    const double total = (double)dataSet.row;
    double entropy = 0.0;
    map<String, int>::iterator entry = histogram.begin();
    while (entry != histogram.end()) {
        const double p = entry->second / total;
        entropy -= p * log2(p);
        ++entry;
    }
    return entropy;
}
// Selects the rows of `dataSet` whose column `axis` equals `value` and
// returns them with that column removed.
// The result's `col` is dataSet.col - 1 when at least one row matched and 0
// otherwise. `axis` must be a valid column index; it is not checked.
DataSet splitDataSet(DataSet dataSet, int axis, String value){
    // A plain local object: the previous heap allocation was returned by
    // value and never released.
    DataSet retDataSet;
    const int retCol = dataSet.col-1;
    for (int i = 0; i < dataSet.row; i++){
        const VS& source = dataSet.dataSet[i];
        if (source[axis] != value) {
            continue;
        }
        // Reduced row: everything before `axis`, then everything after it.
        VS reduced(source.begin(), source.begin()+axis);
        reduced.insert(reduced.end(), source.begin()+axis+1, source.begin()+dataSet.col);
        retDataSet.push_back(reduced);
    }
    if(retDataSet.row > 0){
        retDataSet.col = retCol;
    }
    return retDataSet;
}
// Picks the feature column with the largest information gain.
// The last column is the class and is never a candidate.
// Returns the column index, or -1 when no feature has a strictly positive
// gain (this includes tables with no rows or no feature columns).
int chooseBestFeatureToSplit(DataSet dataSet){
    const int numFeatures = dataSet.col-1;
    const double baseEntropy = calcShannonEnt(dataSet);
    double bestInfoGain = 0.0;
    int bestFeature = -1;
    for (int i = 0; i < numFeatures; i++) {
        VS featList = dataSet.getCol(i);
        Set<String> uniqueVals(featList);
        double newEntropy = 0.0;
        for(set<String> :: iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
            DataSet subDataSet = splitDataSet(dataSet, i, *value);
            // Weight each subset by the share of ROWS it holds. Using the
            // column counts gives every subset the same weight and makes the
            // gain meaningless.
            const double prob = subDataSet.row / static_cast<double>(dataSet.row);
            newEntropy += prob*calcShannonEnt(subDataSet);
        }
        const double infoGain = baseEntropy - newEntropy;
        if (infoGain > bestInfoGain) {
            bestInfoGain = infoGain;
            bestFeature = i;
        }
    }
    return bestFeature;
}
作死地换了下数据结构...(11.20)
另一个测试样例:
ID AUs (split by space) Emotion
train1 1 2 5 25 27 7
train2 1 2 25 27 7
train3 1 4 15 17 6
train4 1 2 5 12 25 27 7
train5 1 4 17 39 6
train6 1 15 17 6
train7 1 2 5 15 25 27 7
train8 1 2 5 25 26 7
train9 1 2 5 16 25 27 7
train10 1 2 4 15 17 6
train11 1 4 7 20 25 4
train12 4 7 9 25 3
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
#pragma region TypeName
typedef string String;
//typedef String Label;
// Sequence of strings (used for the feature names).
typedef vector<String> VS;
// Sequence of ints: one table row, one table column, or a list of ids.
typedef vector<int> VI;
#pragma endregion
#pragma region Global Variables
// gRow: lines expected in the input file (init() treats line 0 as a header).
// gCol: width of a table row; the last column holds the class.
const static int gRow = 13, gCol = 40; // 39 + 1
// Feature names, in column order.
VS labels;
// Feature ids, in column order; passed to createTree.
VI labelsId;
// Feature id -> name, used when printing internal nodes.
map<int, string> id2labels;
// Class id -> name, used when printing leaves.
map<int, string> id2class;
#pragma endregion
#pragma region ClassType
// Table of int cells stored row by row. `row` and `col` are maintained by
// hand and are public, so callers may overwrite them directly.
class DataSet{
public:
    // Empty table: zero rows, zero columns.
    DataSet() : row(0), col(0) {}

    // Stores the requested dimensions and appends `row` rows; every one of
    // them starts out empty (no cells are created for the columns).
    DataSet(int row, int col) : row(row), col(col) {
        for (int pending = row; pending > 0; --pending) {
            dataSet.push_back(VI());
        }
    }

    // Appends one row and increments the row counter; `col` is not touched.
    void push_back(VI v) {
        ++row;
        dataSet.push_back(v);
    }

    // Copy of column `id` taken from the first `row` rows (no bounds check).
    VI getCol(int id) {
        VI column;
        int r = 0;
        while (r < row) {
            column.push_back(dataSet[r][id]);
            ++r;
        }
        return column;
    }

    int row;            // number of rows considered valid
    int col;            // number of columns considered valid
    vector<VI> dataSet; // the cells, one VI per row
} dataSet; // the single global instance used by the program
// One node of the decision tree.
//   featureLabel - id of the feature tested at this node, or the class id
//                  when the node is a leaf
//   featureValue - value of the parent's feature that leads to this node
class Node{
public:
    Node(int label, int value) : featureLabel(label), featureValue(value) {}

    // Adds `node` (copied) as the last child.
    void insert(const Node& node) { child.push_back(node); }

    // Iteration over the direct children.
    typename vector<Node> ::iterator begin(){ return child.begin(); }
    typename vector<Node> ::iterator end(){ return child.end(); }

    // A node without children is a leaf.
    bool isLeafNode()const { return child.empty(); }

    // Returns the child reached through feature value `value`.
    // Throws std::out_of_range when there is no such child; falling off the
    // end of the function without returning would be undefined behaviour.
    Node& getChild(int value){
        for (vector<Node> ::iterator it = child.begin(); it != child.end(); ++it) {
            if (it->featureValue == value)
                return *it;
        }
        throw out_of_range("Node::getChild: no child for the requested feature value");
    }
    //bool operator < (const Node& oNode) const{ return this->featureValue < oNode.featureValue; }

    // Prints the tree breadth-first, one line per queued node:
    //   value:"label" -> childValue:"childLabel" -> ...
    // Internal nodes are named through id2labels, leaves through id2class.
    // Looking up an id that is missing from either map inserts an empty name.
    void print() const {
        // Queue pointers, not Node copies: copying a Node copies its whole
        // subtree, and the tree is not modified while printing.
        queue<const Node*> pending;
        pending.push(this);
        while (!pending.empty()){
            const Node* current = pending.front();
            pending.pop();
            // Only the root can reach this point as a leaf (leaf children are
            // never queued); its label is then a class id, not a feature id.
            const string& ownName = current->isLeafNode()
                ? id2class[current->featureLabel]
                : id2labels[current->featureLabel];
            cout << current->featureValue << ":\"" << ownName << "\"";
            for (vector<Node> ::const_iterator it = current->child.begin();
                 it != current->child.end(); ++it) {
                if (it->isLeafNode()) {
                    cout << " -> " << it->featureValue << ":\"" << id2class[it->featureLabel] << "\"";
                } else {
                    cout << " -> " << it->featureValue << ":\"" << id2labels[it->featureLabel] << "\"";
                    pending.push(&*it);
                }
            }
            cout << endl;
        }
    }
private:
    int featureLabel; // feature id (internal node) or class id (leaf)
    int featureValue; // value of the parent's feature leading to this node
    vector<Node> child;
};
// Thin wrapper around std::set that can be built directly from a vector,
// keeping one copy of each distinct element in sorted order.
template <class T>
class Set{
public:
    Set(){}

    // Collects the distinct elements of `v`. Taking a const reference lets
    // callers pass const vectors and temporaries as well as plain lvalues.
    Set(const std::vector<T>& v) : s(v.begin(), v.end()) {}

    // Iterator to `item`, or end() when it is absent.
    typename std::set<T> ::iterator find(const T& item) { return s.find(item); }

    // Adds `item`; a value that is already present is ignored.
    void insert(const T& item) { s.insert(item); }

    typename std::set<T> ::iterator begin(){ return s.begin(); }
    typename std::set<T> ::iterator end(){ return s.end(); }

    // Number of distinct elements.
    int size() const{ return static_cast<int>(s.size()); }
private:
    std::set<T> s;
};
#pragma endregion
#pragma region FunctionPrototype
// Forward declarations; the definitions follow main() further down.
// Formats a streamable value as text.
template <typename T>
String ConvertToString(T);
// Base-2 logarithm.
double log2(double);
// Writes the table to standard output, one row per line.
void printDataSet(DataSet);
// Shannon entropy of the class column (the last column).
double calcShannonEnt(DataSet);
// Rows whose column `axis` equals `value`, with that column removed.
DataSet splitDataSet(DataSet dataSet, int axis, int value);
// Index of the feature column chosen for the next split, or -1.
int chooseBestFeatureToSplit(DataSet d);
// Most frequent value in `classList`.
int majorityCnt(VI classList);
// Builds the decision tree; `labels` holds feature ids, `cvalue` is the
// parent's feature value leading to the returned node.
Node createTree(DataSet dataSet, VI labels, int cvalue);
#pragma endregion
void init(){ // 我想考虑到通用情况,就是每个特征有很多取值[0~n]
dataSet.col = gCol;
ifstream file("Dataset_txt format.txt");
String str[gRow];
for (int row = 0; !file.eof(); row++) {
getline(file, str[row], '\n'); // 读入一行
}
//for (int i = 0; i < gRow; i++) { cout << str[i] << endl; }
for (int i = 1; i < gRow; i++) {
stringstream ss;
int id;
String strId;
VI v(gCol);
ss << str[i];
//cout << str[i] << endl;
ss >> strId; // 去掉第一列,即为列号
while (!ss.eof()) { // !!
ss >> id;
if(!ss.eof()) { // 不是最后一列的数据
v[id-1] = 1; // 这组数据集的特征除了0就是1
//cout << "id:" << id << endl;
} else { // 是最后一列的数据,类标签
v[gCol-1] = id;
break;
}
}
dataSet.push_back(v);
}
for (int i = 0; i < gCol; i++) {
String label = "AU"+ConvertToString(i+1);
//cout << "label: " << label << endl;
labels.push_back(label);
id2labels[i] = label;
labelsId.push_back(i);
}
for ( int i = 0; i < 10; i++) {
id2class[i] = ConvertToString(i);
}
file.close();
cout << "dataSet:\n";
printDataSet(dataSet);
cout << "row: " << dataSet.row << "\ncol: " << dataSet.col << endl;
}
// Alternative to init(): loads a hand-written 5x3 sample (two features plus
// the class column) and registers the matching names.
// The table encodes the original string sample with 1 = "yes" and 0 = "no"
// in the class column.
void init1() {
    const int kRows = 5;
    const int kCols = 3;
    const int samples[kRows][kCols] = { { 1, 1, 1 },
                                        { 1, 1, 1 },
                                        { 1, 0, 0 },
                                        { 0, 1, 0 },
                                        { 0, 1, 0 } };
    dataSet.row = kRows;
    dataSet.col = kCols;
    // Rows are appended to the underlying vector directly, because `row` has
    // already been set above.
    for (int r = 0; r < kRows; ++r) {
        dataSet.dataSet.push_back(VI(samples[r], samples[r] + kCols));
    }
    printDataSet(dataSet);
    cout << endl;

    // Column names, registered in every lookup structure under their index.
    const char* const columnNames[kCols] = { "No surfacing", "Flippers", "Fish" };
    for (int id = 0; id < kCols; ++id) {
        labels.push_back(columnNames[id]);
        labelsId.push_back(id);
        id2labels[id] = columnNames[id];
    }
    id2class[1] = "yes";
    id2class[0] = "no";
}
int main(){
init();
//cout << calcShannonEnt(dataSet) << endl; // test1
//printDataSet(splitDataSet(dataSet, 0, 1)); cout<< endl;
//printDataSet(splitDataSet(dataSet, 0, 0));//test2
//cout << chooseBestFeatureToSplit(dataSet) << endl;// test3
createTree(dataSet, labelsId, -1).print(); // test4 -1是根节点
return 0;
}
// Dumps the table to standard output: cells separated (and followed) by a
// space, one row per line.
void printDataSet(DataSet dataSet){
    int r = 0;
    while (r < dataSet.row) {
        int c = 0;
        while (c < dataSet.col) {
            cout << dataSet.dataSet[r][c] << ' ';
            ++c;
        }
        cout << endl;
        ++r;
    }
}
// Base-2 logarithm, obtained from the natural logarithm by change of base.
double log2(double x){
    const double lnTwo = log(2.0);
    const double lnX = log(x);
    return lnX / lnTwo;
}
// Renders `value` through its stream insertion operator and returns the text.
template <typename T>
String ConvertToString(T value) {
    ostringstream out;
    out << value;
    return out.str();
}
// Shannon entropy (in bits) of the class column, i.e. the last column of
// `dataSet`. A table without rows gives 0.
double calcShannonEnt(DataSet dataSet){
    // Count how often each class id occurs.
    map<int, int> histogram;
    const int classCol = dataSet.col - 1;
    int r = 0;
    while (r < dataSet.row) {
        ++histogram[dataSet.dataSet[r][classCol]];
        ++r;
    }

    // Sum -p * log2(p) over the observed classes.
    const double total = (double)dataSet.row;
    double entropy = 0.0;
    map<int, int>::iterator entry = histogram.begin();
    while (entry != histogram.end()) {
        const double p = entry->second / total;
        entropy -= p * log2(p);
        ++entry;
    }
    return entropy;
}
// Selects the rows of `dataSet` whose column `axis` equals `value` and
// returns them with that column removed.
// The result's `col` is dataSet.col - 1 when at least one row matched and 0
// otherwise. `axis` must be a valid column index; it is not checked.
DataSet splitDataSet(DataSet dataSet, int axis, int value){
    DataSet subset; // starts with row == 0 and col == 0
    const int keptCols = dataSet.col - 1;
    int r = 0;
    while (r < dataSet.row) {
        const VI& source = dataSet.dataSet[r];
        ++r;
        if (source[axis] != value) {
            continue;
        }
        // Reduced row: the cells before `axis` followed by the cells after it.
        VI reduced(source.begin(), source.begin() + axis);
        reduced.insert(reduced.end(), source.begin() + axis + 1, source.begin() + dataSet.col);
        subset.push_back(reduced);
    }
    if (subset.row > 0) {
        subset.col = keptCols;
    }
    return subset;
}
// Picks the feature column with the largest information gain.
// The last column is the class and is never a candidate.
// Returns the column index, or -1 when no feature has a strictly positive
// gain (this includes tables with no rows or no feature columns).
int chooseBestFeatureToSplit(DataSet dataSet){
    const int numFeatures = dataSet.col - 1;
    const double baseEntropy = calcShannonEnt(dataSet);
    double bestInfoGain = 0.0;
    int bestFeature = -1;
    for (int i = 0; i < numFeatures; i++) {
        VI featList = dataSet.getCol(i);
        Set<int> uniqueVals(featList);
        double newEntropy = 0.0;
        for (set<int> ::iterator value = uniqueVals.begin(); value != uniqueVals.end(); value++) {
            DataSet subDataSet = splitDataSet(dataSet, i, *value);
            // Weight each subset by the share of ROWS it holds. Using the
            // column counts gives every subset the same weight and makes the
            // gain meaningless.
            const double prob = subDataSet.row / static_cast<double>(dataSet.row);
            newEntropy += prob*calcShannonEnt(subDataSet);
        }
        const double infoGain = baseEntropy - newEntropy;
        if (infoGain > bestInfoGain) {
            bestInfoGain = infoGain;
            bestFeature = i;
        }
    }
    return bestFeature;
}
// Returns the most frequent value in `classList`. On a tie the value that
// reached the winning count first is kept.
// An empty list yields -1; the result was previously an uninitialised int.
int majorityCnt(VI classList) {
    map<int, int> classCount;
    int majorityLabel = -1;
    int maxClassCount = 0;
    for (VI::iterator it = classList.begin(); it != classList.end(); ++it) {
        int& count = classCount[*it];
        ++count;
        if (maxClassCount < count) {
            maxClassCount = count;
            majorityLabel = *it;
        }
    }
    return majorityLabel;
}
// Recursively builds the ID3 decision tree for `dataSet`.
//   dataSet - table whose last column holds the class id to predict
//   labels  - feature ids, one per feature column of `dataSet`
//   cvalue  - value of the parent's feature that leads to this subtree
// Returns the root node of the (sub)tree. A table without rows produces a
// single leaf labelled -1.
Node createTree(DataSet dataSet, VI labels, int cvalue){
    VI classList = dataSet.getCol(dataSet.col - 1);
    if (classList.empty()) {        // nothing to learn from
        return Node(-1, cvalue);
    }
    Set<int> uniqueVals(classList);
    if (uniqueVals.size() == 1) {   // every row has the same class: leaf
        return Node(classList[0], cvalue);
    }
    if (dataSet.col == 1) {         // no feature columns left: majority vote
        return Node(majorityCnt(classList), cvalue);
    }
    const int bestFeat = chooseBestFeatureToSplit(dataSet);
    // chooseBestFeatureToSplit returns -1 when no feature improves on the
    // current entropy (for example identical rows with different classes).
    // Indexing `labels` with it would be undefined behaviour, so stop here
    // with a majority-vote leaf instead.
    if (bestFeat < 0 || bestFeat >= static_cast<int>(labels.size())) {
        return Node(majorityCnt(classList), cvalue);
    }
    Node myTree(labels[bestFeat], cvalue);   // this node tests the chosen feature
    labels.erase(labels.begin() + bestFeat); // the children no longer see that column
    VI featValues = dataSet.getCol(bestFeat);
    Set<int> branchValues(featValues);
    // One child per distinct value of the chosen feature.
    for (set<int> ::iterator value = branchValues.begin(); value != branchValues.end(); value++) {
        myTree.insert(createTree(splitDataSet(dataSet, bestFeat, *value),
            labels, *value));
    }
    return myTree;
}