id3 c语言,ID3 c语言版 附例子

网上很多ID3实现的代码,但都没有数据文件的例子,不管是matlab还是c语言版都没需要自己摸索设置例子.比较麻烦,刚搞清除c语言版的,matlab的还需要再研究研究.

这一个id3代码只能分析数值型的数据,只能进行二分类.

输入文件:

数据文件  .dat 是以逗号分割的数据保护,n行样本,m列属性

属性标签  .tag 表示属性的名称,属性个数必须个数与m相等,一行一个名称

主函数:

1.从.dat读取行数和列数,即样本个数n和属性个数m

2.从.tag文件中读取属性的名称,最后一个是分类的属性

3.申请n*m的矩阵空间

4.从.dat文件中,将数据读入矩阵

5.进行ID3分析

为每个样本的每个属性值,计算信息增量,记录下最小的,按这个属性和值来划分

关键是计算信息增量的函数,negentropy

6.输出决策树

proto.h

NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );

void print_tree ( NODE* , CHAR** );

void free_tree ( NODE  * );

NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );

void err_exit ( CHAR* , UINT );

MATRIX *build_matrix ( UINT, UINT );

void free_matrix ( MATRIX * );

void read_matrix ( CHAR *, MATRIX * );

void file_size ( CHAR * , UINT * , UINT * );

CHAR **read_tags ( CHAR * , UINT );

void free_tags ( CHAR **, UINT);

id3.h

typedef unsigned int  UINT;

typedef unsigned long ULONG;

typedef          char CHAR;

typedef unsigned char BOOL;

typedef double        REAL;

typedef struct node {

UINT idx; /* ID code for attribute */

REAL threshold; /* Numerical threshold for attribute test */

struct node *on; /* Address of 'on' node */

struct node *off; /* Address of 'off' node */

struct node *parent; /* Addess of parent node */

} NODE;

typedef struct ne_struct {

REAL ne;

UINT status;

} NEGENTROPY;

typedef struct matrix {

UINT width;

UINT height;

REAL **data;

} MATRIX;

enum UINT { INACTIVE, OFF, ON };

#define LN_2 0.693147180559945309417

#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)

id3.c

/*

* FILE: id3.c

*

* Author: Andrew Colin

*

* DISCLAIMER: No liability is assumed by the author for any use made

* of this program.

*

* DISTRIBUTION: Any use may be made of this program, as long as the

* clear acknowledgment is made to the author in code and runtime

* executables

*/

#include #include #include #include #include #include #include #include

#include "id3.h"

#include "proto.h"

/*-------------------------------------------------------------------*/

MATRIX *build_matrix (UINT width, UINT height)

{

MATRIX *_matrix;

UINT i;

_matrix = (MATRIX*) malloc (sizeof (MATRIX));

if (!_matrix)

err_exit (__FILE__, __LINE__);

_matrix->width  = width;

_matrix->height = height;

_matrix->data = (REAL**) malloc (height * sizeof (REAL*));

if (_matrix->data == NULL)

err_exit(__FILE__, __LINE__);

for (i=0; i{

_matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));

if (_matrix->data[i] == NULL)

err_exit(__FILE__, __LINE__);

}

return _matrix;

}

/*-------------------------------------------------------------------*/

/*

* Standard error handler function

*/

void err_exit (CHAR* file, UINT line)

{

printf("\n Fatal error in file %s, line %u", file, line);

exit(0);

}

/*-------------------------------------------------------------------*/

void file_size (CHAR *file_name, UINT *width, UINT *height)

/*

* Given the name of a file of numeric data, this routine counts

* the numbers of rows and columns. It's assumed that the number

* of entries is the same in each row, and an error is flagged if this

* is not the case.

*

*/

{

FILE *f;

UINT buf_size = 0xFF, _width = 0;

CHAR *buffer, *ptr;

*width = *height = 0;

buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));

if (buffer == NULL)

err_exit (__FILE__, __LINE__);

/* Open price file - abort if filename invalid */

f = fopen(file_name, "r");

if (f == NULL)

{

printf("\n File not found : %s\n", file_name);

err_exit (__FILE__, __LINE__);

}

/* Get number of entries in first row */

if (fgets(buffer, buf_size, f) != NULL)

{

++*height;

ptr = strtok (buffer, " ,");

while (ptr != NULL)

{

++*width;

ptr = strtok (NULL, " ,");

}

}

/* Count numbers of subsequent rows */

while (!feof(f))

{

if (fgets(buffer, buf_size, f) != NULL)

{

if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */

{

++*height;

_width = 0;

ptr = strtok (buffer, " ,");

while (ptr != NULL)

{

++_width;

ptr = strtok (NULL, " ,");

}

if (*width != _width)

{

printf("\n Number of entries in file %s did not agree", file_name);

err_exit (__FILE__, __LINE__);

}

}

}

}

free (buffer);

}

/*-------------------------------------------------------------------*/

void free_matrix (MATRIX *_matrix)

{

UINT i;

for (i=0; iheight; i++)

free (_matrix->data[i]);

free (_matrix->data);

free (_matrix);

}

/*-------------------------------------------------------------------*/

void free_tags ( CHAR** varname, UINT width)

{

UINT i;

for (i=0; ifree(varname[i]);

free (varname);

}

/*-------------------------------------------------------------------*/

void free_tree ( NODE  *node )

{

/*

*  Frees the memory allocated to a tree structure

*/

if (node == NULL)

return;

else

{

free_tree (node->on);

free_tree (node->off);

free(node);

}

}

/*-------------------------------------------------------------------*/

NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)

/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */

{

NEGENTROPY negentropy_struct;

NODE *node;

UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;

REAL **data = matrix->data;

REAL best_threshold, min_negentropy, _negentropy;

/* Allocate memory for this node */

node = (NODE*) malloc (sizeof(NODE));

if (!node)

err_exit (__FILE__, __LINE__);

/* Set up links in decision tree */

node->parent = parent;  /* Set address of parent node */

if (parent != NULL) /* parent to child; not relevant for root node */

{

/* Pass address of this node to the parent node */

if (state == ON)

parent->on = node;

else

if (state == OFF)

parent->off = node;

}

/*

* Select attribute with lowest negentropy for splitting. Scan through

* ALL attributes (except the target) and ALL data samples. This is

* pretty inefficient for data sets with repeated values, but will do

* for illustrative purposes

*/

min_negentropy = 1.0;

for (i=0; i{

for (j=0; j{

if (i != target)

{

/* Set trial values for this node... */

node->idx = i;

node->threshold = data[j][i];

/* ...and calculate the negentropy of this partition */

negentropy_struct = negentropy (data, n_samples, node, target);

_negentropy = negentropy_struct.ne;

/* If this negentropy is lower than any other, retain the

index and threshold for future use */

if (_negentropy < min_negentropy)

{

min_negentropy = _negentropy;

split = i;

best_threshold = data[j][i];

}

} /*if (i != target)*/

} /*for (j=0; j} /*for (i=0; i/* Save the combination of best attribute and threshold value */

node->idx = split;

node->threshold = best_threshold;

/*

* If the negentropy routine found itself at an end-of-branch

* for the decision tree, the 'status' flag in 'negentropy_struct'

* is set to ON or OFF and the node labelled accordingly. Otherwise,

* ID3 continues to call itself until all end-of-branch nodes are

* found.

*/

if  (negentropy_struct.status != INACTIVE)

{

node->on = node->off = NULL;

node->idx = negentropy_struct.status;

printf("neigentropy inactive\n");

}

else

{

node->on  = ID3 (matrix, node, target, ON);

node->off = ID3 (matrix, node, target, OFF);

}

printf("ID3 finish!\n");

return node;

}

/*-------------------------------------------------------------------*/

void main (int argv, char *argc[])

{

MATRIX *matrix;

NODE *node;

UINT target, n_vars, n_samples;

CHAR data_file[13], tag_file[13];  /* Longest file name in DOS */

CHAR **tag_names;

/* Set up file names */

if (argv != 2)

{

printf("\nUsage: id3 [datafile]");

exit(0);

}

else

{

printf("\nWelcome to ID3");

printf("\nLast compiled on %s, %s", __DATE__, __TIME__);

printf("\n");

strcpy(data_file, argc[1]);

strcpy(tag_file,  argc[1]);

strcat(data_file, ".dat");

strcat(tag_file,  ".tag");

}

/* Read dimensions of data file */

file_size (data_file, &n_vars, &n_samples);

/* Read labels for columns of data */

tag_names = read_tags (tag_file, n_vars);

/* Allocate storage for data... */

matrix = build_matrix (n_vars, n_samples);

/* ...and read it from disk */

read_matrix (data_file, matrix);

/* Classification target is last column */

target = n_vars - 1;

/* Return root of decision tree - ID3 continues to call itself

recursively */

node = ID3 ( matrix, NULL, target, 0 );

print_tree(node, tag_names);

printf("\n");

free_tags (tag_names, n_vars);

free_matrix(matrix);

free_tree (node);

}

/*-------------------------------------------------------------------*/

NEGENTROPY negentropy ( REAL **data,

UINT   n_samples,

NODE   *local,

UINT   target)

{

/*

* Calculates the entropy of classification of an attribute, given

* a table of attributes already used, the attribute on which splitting

* is to be taken, and the target attribute. Entropy is calculated in

* bits, so logs are taken to base 2 by dividing by LN_2.

*

* The returned value always lies in the (closed) range [0, 1].

*/

NEGENTROPY ret_val;

NODE *_node, *_parent;

UINT on_ctr, off_ctr, p1, p2, i, _match;

REAL p_on, p_off, negentropy_on, negentropy_off;

on_ctr = off_ctr = p1 = p2 = 0;

/* Scan through all supplied data samples */

for (i=0; i{

/*

* If pattern matches the current position in the decision tree,

* then use this vector. The match is determined by passing up

* the decision tree and checking whether 'data[idx] >= threshold'

* matches at each step, where idx and threshold are taken from

* each node in turn.

*/

_match = 1;

_node = local;

while (_node->parent != NULL)

{ /* If at the root node, all entries match*/

_parent = _node->parent;

if (_node == _parent->on)

{ /* if parent node is ON */

if (data[i][_parent->idx] < _parent->threshold)

_match = 0;

}

else

if (_node == _parent->off)

{ /* if parent node is OFF */

if (data[i][_parent->idx] >= _parent->threshold)

_match = 0;

}

_node = _parent;

}

if (_match)

{

if (data[i][local->idx] >= local->threshold)

{

on_ctr++;

if (data[i][target] >= 2)

p1++;

// printf(" %lf ",data[i][target]);

}

else

{

off_ctr++;

if (data[i][target] >= 2)

p2++;

}

}

}   /* for (i=0; i/* 1: Entropy of subtable with activation ON */

/*

* We now have the numbers of samples that match this part of the

* decision tree, and the number of samples for which the supplied

* condition are true. From these quantities we can find the negentropy of

* this partition.

*/

if (on_ctr > 0)

{

p_on  = (REAL) p1 / (REAL) on_ctr;

p_off = 1 - p_on;

negentropy_on = -entropy (p_on) - entropy (p_off);

}

else

negentropy_on = 0.0;

/* 2: Entropy of subtable with activation OFF */

if (off_ctr > 0)

{

p_on  = (REAL) p2 / (REAL) off_ctr;

p_off = 1 - p_on;

negentropy_off = -entropy (p_on) - entropy (p_off);

}

else

negentropy_off = 0.0;

printf("negentropy_on=%lf off=%lf on_ctr=%d off_ctr=%d p1=%d p2=%d\n",negentropy_on,negentropy_off,on_ctr,off_ctr,p1,p2);

ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);

ret_val.ne /= (on_ctr + off_ctr);

/*

* If all values examined were the same, set 'ret_val.status' to

* the target value since this will be an end-of-branch node

*/

if ((p1 == on_ctr) && (p2 == off_ctr))

ret_val.status = ON;

else if ((p1 == 0) && (p2 == 0))

ret_val.status = OFF;

else

ret_val.status = INACTIVE;

return ret_val;

}

/*-------------------------------------------------------------------*/

void print_tree (NODE *node, CHAR** tag_names)

{

/*

* Displays algebraic equivalent of the decision tree

*/

if (node->on == NULL)

{

if (node->idx == ON)

printf("ON");

else if (node->idx == OFF)

printf("OFF");

return;

}

else

printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);

print_tree ( node->on, tag_names );

printf(" else ");

print_tree ( node->off, tag_names );

printf( " } " );

}

/*-------------------------------------------------------------------*/

void read_matrix (CHAR filename[], MATRIX *matrix)

{

UINT i, j;

UINT height = matrix->height;

UINT width  = matrix->width;

REAL **data = matrix->data;

FILE *f;

/* Open price file */

if ((f = fopen(filename, "r")) == NULL)

{

printf("\n File not found : %s\n", filename);

err_exit (__FILE__, __LINE__);

}

for (i=0; ifor (j=0; j{

fscanf(f, "%lf,", &data[i][j] );

printf("%lf ",data[i][j]);

}

fclose(f);

printf("\nRead Matrix Finish!\n");

}

/*-------------------------------------------------------------------*/

CHAR **read_tags (CHAR *tag_file, UINT width)

{

FILE *f;

CHAR **_varname;

UINT i;

CHAR buffer[0xFF];

f = fopen(tag_file, "r");

if (f == NULL)

{

printf("\n File not found : %s\n", tag_file);

err_exit (__FILE__, __LINE__);

}

_varname = (CHAR**) malloc (width * sizeof (CHAR*));

for (i=0; i_varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));

i = 0;

while (!feof(f))

{

if (fgets(buffer, 0xFF, f) != NULL)

{

if (strlen(buffer) > strlen("\n"))

{

if (i>width-1)

{

printf("\nMore variable names were detected than data items.%d",width);

printf("\nPlease correct this problem before proceeding");

exit(0);

}

sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);

i++;

}

}

}

if (i{

printf("\nFewer variable names than data items were detected.%d",i);

printf("\nPlease correct this problem before proceeding");

exit(0);

}

fclose (f);

return _varname;

}

样本的例子参考了matlab的数据名为fisheriris,这是数据挖掘书上经典的鸢尾花种类的数据。把setosa改为1,versicolor改为2,virginica改为3。由于这个代码只能进行2分类,所以1为1类对应OFF,23为另外一类ON。

1.dat

5.1,3.5,1.4,0.2,1

4.9,3,1.4,0.2,1

4.7,3.2,1.3,0.2,1

4.6,3.1,1.5,0.2,1

5,3.6,1.4,0.2,1

5.4,3.9,1.7,0.4,1

4.6,3.4,1.4,0.3,1

5,3.4,1.5,0.2,1

4.4,2.9,1.4,0.2,1

4.9,3.1,1.5,0.1,1

5.4,3.7,1.5,0.2,1

4.8,3.4,1.6,0.2,1

4.8,3,1.4,0.1,1

4.3,3,1.1,0.1,1

5.8,4,1.2,0.2,1

5.7,4.4,1.5,0.4,1

5.4,3.9,1.3,0.4,1

5.1,3.5,1.4,0.3,1

5.7,3.8,1.7,0.3,1

5.1,3.8,1.5,0.3,1

5.4,3.4,1.7,0.2,1

5.1,3.7,1.5,0.4,1

4.6,3.6,1,0.2,1

5.1,3.3,1.7,0.5,1

4.8,3.4,1.9,0.2,1

5,3,1.6,0.2,1

5,3.4,1.6,0.4,1

5.2,3.5,1.5,0.2,1

5.2,3.4,1.4,0.2,1

4.7,3.2,1.6,0.2,1

4.8,3.1,1.6,0.2,1

5.4,3.4,1.5,0.4,1

5.2,4.1,1.5,0.1,1

5.5,4.2,1.4,0.2,1

4.9,3.1,1.5,0.2,1

5,3.2,1.2,0.2,1

5.5,3.5,1.3,0.2,1

4.9,3.6,1.4,0.1,1

4.4,3,1.3,0.2,1

5.1,3.4,1.5,0.2,1

5,3.5,1.3,0.3,1

4.5,2.3,1.3,0.3,1

4.4,3.2,1.3,0.2,1

5,3.5,1.6,0.6,1

5.1,3.8,1.9,0.4,1

4.8,3,1.4,0.3,1

5.1,3.8,1.6,0.2,1

4.6,3.2,1.4,0.2,1

5.3,3.7,1.5,0.2,1

5,3.3,1.4,0.2,1

7,3.2,4.7,1.4,2

6.4,3.2,4.5,1.5,2

6.9,3.1,4.9,1.5,2

5.5,2.3,4,1.3,2

6.5,2.8,4.6,1.5,2

5.7,2.8,4.5,1.3,2

6.3,3.3,4.7,1.6,2

4.9,2.4,3.3,1,2

6.6,2.9,4.6,1.3,2

5.2,2.7,3.9,1.4,2

5,2,3.5,1,2

5.9,3,4.2,1.5,2

6,2.2,4,1,2

6.1,2.9,4.7,1.4,2

5.6,2.9,3.6,1.3,2

6.7,3.1,4.4,1.4,2

5.6,3,4.5,1.5,2

5.8,2.7,4.1,1,2

6.2,2.2,4.5,1.5,2

5.6,2.5,3.9,1.1,2

5.9,3.2,4.8,1.8,2

6.1,2.8,4,1.3,2

6.3,2.5,4.9,1.5,2

6.1,2.8,4.7,1.2,2

6.4,2.9,4.3,1.3,2

6.6,3,4.4,1.4,2

6.8,2.8,4.8,1.4,2

6.7,3,5,1.7,2

6,2.9,4.5,1.5,2

5.7,2.6,3.5,1,2

5.5,2.4,3.8,1.1,2

5.5,2.4,3.7,1,2

5.8,2.7,3.9,1.2,2

6,2.7,5.1,1.6,2

5.4,3,4.5,1.5,2

6,3.4,4.5,1.6,2

6.7,3.1,4.7,1.5,2

6.3,2.3,4.4,1.3,2

5.6,3,4.1,1.3,2

5.5,2.5,4,1.3,2

5.5,2.6,4.4,1.2,2

6.1,3,4.6,1.4,2

5.8,2.6,4,1.2,2

5,2.3,3.3,1,2

5.6,2.7,4.2,1.3,2

5.7,3,4.2,1.2,2

5.7,2.9,4.2,1.3,2

6.2,2.9,4.3,1.3,2

5.1,2.5,3,1.1,2

5.7,2.8,4.1,1.3,2

6.3,3.3,6,2.5,3

5.8,2.7,5.1,1.9,3

7.1,3,5.9,2.1,3

6.3,2.9,5.6,1.8,3

6.5,3,5.8,2.2,3

7.6,3,6.6,2.1,3

4.9,2.5,4.5,1.7,3

7.3,2.9,6.3,1.8,3

6.7,2.5,5.8,1.8,3

7.2,3.6,6.1,2.5,3

6.5,3.2,5.1,2,3

6.4,2.7,5.3,1.9,3

6.8,3,5.5,2.1,3

5.7,2.5,5,2,3

5.8,2.8,5.1,2.4,3

6.4,3.2,5.3,2.3,3

6.5,3,5.5,1.8,3

7.7,3.8,6.7,2.2,3

7.7,2.6,6.9,2.3,3

6,2.2,5,1.5,3

6.9,3.2,5.7,2.3,3

5.6,2.8,4.9,2,3

7.7,2.8,6.7,2,3

6.3,2.7,4.9,1.8,3

6.7,3.3,5.7,2.1,3

7.2,3.2,6,1.8,3

6.2,2.8,4.8,1.8,3

6.1,3,4.9,1.8,3

6.4,2.8,5.6,2.1,3

7.2,3,5.8,1.6,3

7.4,2.8,6.1,1.9,3

7.9,3.8,6.4,2,3

6.4,2.8,5.6,2.2,3

6.3,2.8,5.1,1.5,3

6.1,2.6,5.6,1.4,3

7.7,3,6.1,2.3,3

6.3,3.4,5.6,2.4,3

6.4,3.1,5.5,1.8,3

6,3,4.8,1.8,3

6.9,3.1,5.4,2.1,3

6.7,3.1,5.6,2.4,3

6.9,3.1,5.1,2.3,3

5.8,2.7,5.1,1.9,3

6.8,3.2,5.9,2.3,3

6.7,3.3,5.7,2.5,3

6.7,3,5.2,2.3,3

6.3,2.5,5,1.9,3

6.5,3,5.2,2,3

6.2,3.4,5.4,2.3,3

5.9,3,5.1,1.8,3

1.tag

height

weight

lh

lw

species

编译后,这样执行:

id3.exe 1

去掉了一些帮助信息后,结果主要为:

Welcome to ID3

Last compiled on Jul 30 2011, 21:11:55

Read Matrix Finish!

neigentropy inactive

ID3 finish!

ID3 finish!

if { lh >= 3.00 then ON else OFF }

阅读(1131) | 评论(1) | 转发(0) |

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值