123

typedef unsigned int  UINT;
typedef unsigned long ULONG;
typedef          char CHAR;
typedef unsigned char BOOL;
typedef double        REAL;
typedef          char STRING[20];

typedef struct node {
   UINT tag_idx; /* ID code for attribute */
   UINT dif_var_idx;//有几个不同的值
   struct node *parent; /* 该点上一代的地址 */
   struct node **child; /* 指向该点子代地址的指针的指针*/
} NODE;

typedef struct ne_struct {
    REAL ne;
    UINT status;
} NEGENTROPY;//negentropy负熵

typedef struct matrix {
   UINT width;
   UINT height;
   STRING **data;
} MATRIX;//数据

enum {
    INACTIVE = 0, //不起作用的
    ACTIVE   = 1,
    INVALID_TAG_IDX = 0XFFFF,//无效的属性
    INVALID_DIF_VAR_IDX = 0XFFFF//无效的属性的个数
};
#define LN_2 0.693147180559945309417
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)

=================================================================================================================

 

 

 


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>
#include "SAMIR.h"

 

 

/*-------------------------------------------------------------------*/

NEGENTROPY negentropy ( UINT column, NODE *parent );
void print_tree ( NODE *node, UINT level);
void free_tree ( NODE  * );
NODE* ID3 ( NODE* parent, UINT state, UINT *used );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( );
void free_matrix ( );
void read_matrix ( CHAR * );
void file_size ( CHAR * );
CHAR **read_tags ( CHAR * );
void free_tags ();
UINT *Allocate_n_dif_vars ( );
UINT *Allocate_used ();
STRING **Allocate_dif_vars ();
void find_dif_vars (CHAR *file_name);
bool exsit(UINT n_dif_vars, UINT i, CHAR *ptr);
/*-------------------------------------------------------------------*/

MATRIX *matrix;  /* 读进整个数据 */

CHAR **tag_names;  /* 读进属性名 */

UINT target,       /* 决策是在哪一列,即type是在哪一列 */
     n_vars,       /* 数据有几列 */
     n_samples;    /* 数据有几行 */

UINT *n_dif_vars;  /* 各列有几个不同的值 */
STRING **dif_vars; /* 各列不同的值 */

UINT MAX_N_DIF_VARS = 10; /* 假设每列最大的不同值个数为10 */

UINT *used;        /* 在建树时用来判断哪些列被搜索过了 */


int main (int argv, char *argc[])
{
    freopen("output.txt", "w", stdout);
   
    NODE *node;
   
    CHAR data_file[13], tag_file[13];  /* Longest file name in DOS */
   
 
    UINT i;

    strcpy(data_file, "input_1.txt");
    strcpy(tag_file,  "input_2.txt");
   
    /* 找数据有多少行多少列,并保存到n_samples和n_vars */
    file_size (data_file);
   
    /* 分配空间给n_dir_vars, dir_vars, 以及used*/
    n_dif_vars = Allocate_n_dif_vars ( );
    dif_vars = Allocate_dif_vars ();
    used = Allocate_used ( );
   
    /* 找各列不同的值,并记录到dif_vars[][]里 */
    find_dif_vars (data_file);   

    /* 读进属性名 */
    tag_names = read_tags (tag_file);

    /* 分配空间给matrix */
    matrix = build_matrix ();

    /* 读进整个数据 */
    read_matrix (data_file);

    /* 分类目标是最后一列 */
    target = n_vars - 1;
   
    /* 初始化各列都还没有搜索过 */
    for (i=0; i<n_vars; i++)
        used[i] = 0;
   
    /* 返回决策树根节点给node */
    node = ID3 ( NULL, 0, used );
   
    /* 输出决策树 */
    print_tree(node, 0);
   
    /* 释放空间 */
    free_tags ( );
    free_matrix ( );
    free_tree (node);

    return 0;
}

/*-------------------------------------------------------------------*/

void print_type ( NODE *node )
{
    STRING **data;
    NODE *_local;
    UINT i;
    BOOL _match;
   
    data = matrix->data;
   
    for (i=0; i<n_samples; i++)
    {
        _match = true;
        _local = node;
        while (_local->parent != NULL)
        {
            /* If at the root node, all entries match*/

            if ( strcmp(data[i][_local->parent->tag_idx], dif_vars[_local->parent->tag_idx][_local->dif_var_idx]) != 0 ) {
                _match = false;
                break;
            }   
         
            _local = _local->parent;
        }
       
        if (_match == true)
        {
            printf("[%8s: %-4s]\n", "type", data[i][target]);
            break;
        }   
    }   
}   

/*-------------------------------------------------------------------*/

void free_tree ( NODE *node ) {
    UINT i;
   
    if (node == NULL)
        return;
    if (node->child == NULL)
    {  
        free ( node );
        return;
    }  
   
    for (i=0; i<n_dif_vars[node->tag_idx]; i++)
    {
        free ( node->child[i] );
    }
   
    free ( node );
}   

/*-------------------------------------------------------------------*/

void print_tree ( NODE *node , UINT level )
{
    UINT i, j, k;
   
    if (node == NULL)
    {
        printf("%d  000000000000\n", level);
        return;
    }   
    if (node->child == NULL)
    {
        if (node->parent == NULL)
            printf("[root]          \n");
        else
            printf("[%8s: %-4s]", tag_names[node->parent->tag_idx], dif_vars[node->parent->tag_idx][node->dif_var_idx]);
        print_type ( node );
        return;
    }
    else
    {  
        if (node->parent == NULL)
            printf("[root]          \n");
        else
            printf("[%8s: %-4s]", tag_names[node->parent->tag_idx], dif_vars[node->parent->tag_idx][node->dif_var_idx]);
   
        for (i=0; i<n_dif_vars[node->tag_idx]; i++)
        {
            if(i>0)
                for(j=0; j<level; j++)
                    for(k=0; k<16; k++)
                        printf(" ");
            print_tree ( node->child[i], level+1);
        }   
    }   
}
  
/*-------------------------------------------------------------------*/

NODE* ID3 ( NODE* parent, UINT state, UINT *used )

/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */
{
    NEGENTROPY negentropy_struct;
    NODE *node, *_parent;
    UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
    STRING **data = matrix->data;
    REAL max_negentropy, _negentropy;
    UINT status;
   
    /* Allocate memory for this node */
    node = (NODE*) malloc (sizeof(NODE));
    if (!node)
        err_exit (__FILE__, __LINE__);
   
    node->parent = parent; /* Set address of parent node */
   
    if (parent == NULL)
        node->dif_var_idx = INVALID_DIF_VAR_IDX;
    else
        node->dif_var_idx = state;
 
    /*
     * Select attribute with lowest negentropy for splitting. Scan through
     * ALL attributes (except the target) and ALL data samples. This is
     * pretty inefficient for data sets with repeated values, but will do
     * for illustrative purposes
     */
   
    _parent = parent;
   
    status = INACTIVE;
    max_negentropy = 0.0;
    split = 0;

    for (i=0; i<n_vars; i++) /* 对每一列都进行搜索 */
    {
       
        if (i != target && !used[i]) /* 第i列不是决策目标那一列,且未被搜索过 */
        {
           
            negentropy_struct = negentropy (i, node); /* 计算第i列的熵值 */

            _negentropy = negentropy_struct.ne; /* 临时保存熵值到 _negentropy */

           
            if (_negentropy > max_negentropy) /* 与max_negentropy比较,保留最大的熵值,并把最大熵值的那一列记录到split,状态记录到status*/
            {
                max_negentropy = _negentropy;
                split = i;
                status = negentropy_struct.status;
            }
                               
        } /* if (i != target && !used[i]) */

    } /* for (i=0; i<n_vars; i++) */
   


    if (status == INACTIVE) /* 是叶子节点 */
    {
        node->tag_idx = INVALID_TAG_IDX; /* 不需要再继续做决策了 */
        node->child = NULL;
        return node;
    }   
   
    else { /* 不作为叶子节点 */
    
        node->tag_idx = split; /* 从split列继续做决策 */

        node->child = (NODE**) malloc (n_dif_vars[split] * sizeof(NODE*)); /* split这一列有种不同的值,就有多少个孩子 */
        if (!node->child)
            err_exit (__FILE__, __LINE__);
       
        for (i=0; i<n_dif_vars[split]; i++) /* 继续搜索孩子节点 */
        {
            used[split] = 1; /* split这列已经搜索过了,置为1 */
            node->child[i] = ID3 ( node, i, used );
            used[split] = 0; /* 搜索完了重新置为0 */
        }   

        return node;
    }   
}
 

/*-------------------------------------------------------------------*/

NEGENTROPY negentropy (
    UINT   column,
    NODE   *node)
{
    /*
     * Calculates the entropy of classification of an attribute, given
     * a table of attributes already used, the attribute on which splitting
     * is to be taken, and the target attribute. Entropy is calculated in
     * bits, so logs are taken to base 2 by dividing by LN_2.
     *
     * The returned value always lies in the (closed) range [0, 1].
     */

    NEGENTROPY ret_val;
    NODE *_local;
    UINT i, j, _match;
    REAL MES, M, B, p;
    STRING **data = matrix->data;
   
   
    UINT **count;
    UINT *tem_count;
    UINT *type_count;
    UINT total;
    UINT tem_row, tem_col;
   
    /* Allocate memory for this count */
    count = (UINT**) malloc (n_dif_vars[column] * sizeof(UINT*));
    if (!count)
        err_exit (__FILE__, __LINE__);
   
    for (i=0; i<n_dif_vars[column]; i++)
    {
        count[i] = (UINT*) malloc (n_dif_vars[target] * sizeof(UINT));
        if (!count[i])
            err_exit (__FILE__, __LINE__);
    }
   
    tem_count = (UINT*) malloc (n_dif_vars[column] * sizeof(UINT));
    if (!tem_count)
        err_exit (__FILE__, __LINE__);
       
    type_count = (UINT*) malloc (n_dif_vars[target] * sizeof(UINT));
    if (!type_count)
        err_exit (__FILE__, __LINE__);
   
    /* Initial count */
    for (i=0; i<n_dif_vars[column]; i++)
        for (j=0; j<n_dif_vars[target]; j++)
            count[i][j] = 0;
   
    for (i=0; i<n_dif_vars[column]; i++)
        tem_count[i] = 0;
       
    for (i=0; i<n_dif_vars[target]; i++)
        type_count[i] = 0;
   
    total = 0;

    /* 对每一行数据都进行扫描 */
    for (i=0; i<n_samples; i++)
    {

        _match = 1;
        _local = node;
        while (_local->parent != NULL)
        {
            /* If at the root node, all entries match*/

            if ( strcmp(data[i][_local->parent->tag_idx], dif_vars[_local->parent->tag_idx][_local->dif_var_idx]) != 0 ) {
                _match = 0;
                break;
            }   
         
            _local = _local->parent;
        }

        if (_match)
        {
            for (tem_row=0; tem_row<n_dif_vars[column]; tem_row++)
            {
                if ( strcmp( data[i][column], dif_vars[column][tem_row] ) == 0 )
                    break;
            }
           
            for (tem_col=0; tem_col<n_dif_vars[target]; tem_col++)
            {
                if ( strcmp( data[i][target], dif_vars[target][tem_col] ) == 0 )
                    break;
            }
           
            /* 一些计数, 为后面求各种各样的概率p用 */
            count[tem_row][tem_col]++;
            tem_count[tem_row]++;
            type_count[tem_col]++;
            total++;
           
        }
    }   /* for (i=0; i<n_samples; i++) */
   
    for (i=0; i<n_dif_vars[target]; i++)
    {
        p = (REAL) type_count[i] / (REAL) total;
        MES += -entropy (p);
    }
 

    if (total == 0) /* 如果都没有找到这样的,则作为叶子节点 */
    {
        ret_val.ne = 1.0;
        ret_val.status = INACTIVE;
        return ret_val;
    }
   
    for (i=0; i<n_dif_vars[target]; i++)
        if (type_count[i] == total)
            break;
    if (i < n_dif_vars[target]) /* 全属于一类就不用再分了,也作为叶子节点 */
    {
        ret_val.ne = 1.0;
        ret_val.status = INACTIVE;
        return ret_val;
    }
   
    /* 其他情况用计算熵的公式计算熵 */
    
    M = 0.0;
    B = 0.0;
    for (i=0; i<n_dif_vars[column]; i++)
    {
        if (tem_count[i] == 0) continue;
       
        M = 0.0;
        for (j=0; j<n_dif_vars[target]; j++)
        {
            p  = (REAL) count[i][j] / (REAL) tem_count[i];
            M += -entropy (p);
        }  
        p = (REAL) tem_count[i] / (REAL) total;
        B += p * M;
    }
   
    MES = 0.0;
    for (i=0; i<n_dif_vars[target]; i++)
    {
        p = (REAL) type_count[i] / (REAL) total;
        MES += -entropy (p);
    }   
   
    ret_val.ne = MES - B;
    ret_val.status = ACTIVE;

    return ret_val;
}

/*-------------------------------------------------------------------*/

void read_matrix (CHAR *filename)

{

    UINT i, j;
    UINT height = matrix->height;
    UINT width  = matrix->width;
    STRING **data = matrix->data;
    FILE *f;

    /* Open price file */
    if ((f = fopen(filename, "r")) == NULL)
    {
        printf("\n File not found : %s\n", filename);
        err_exit (__FILE__, __LINE__);
    }

    for (i=0; i<height; i++)
    {
        for (j=0; j<width; j++)
        {
            fscanf(f, "%s", &data[i][j] );
            printf("%-15s", data[i][j]);
        }
        printf("\n");
    }   

    fclose(f);

}

/*-------------------------------------------------------------------*/

MATRIX *build_matrix ( )
{
    MATRIX *_matrix;
    UINT i;

    _matrix = (MATRIX*) malloc (sizeof (MATRIX));
    if (!_matrix)
        err_exit (__FILE__, __LINE__);

    _matrix->width  = n_vars;
    _matrix->height = n_samples;

    _matrix->data = (STRING**) malloc (n_samples * sizeof (STRING*));
    if (_matrix->data == NULL)
        err_exit(__FILE__, __LINE__);

    for (i=0; i<n_samples; i++)
    {
        _matrix->data[i] = (STRING*) malloc (n_vars * sizeof(STRING));
        if (_matrix->data[i] == NULL)
            err_exit(__FILE__, __LINE__);
    }
    return _matrix;
}

/*-------------------------------------------------------------------*/

/*
* Standard error handler function
*/

void err_exit (CHAR* file, UINT line)
{
    printf("\n Fatal error in file %s, line %u", file, line);
    exit(0);
}

/*-------------------------------------------------------------------*/

void file_size (CHAR *file_name)
/*
* Given the name of a file of numeric data, this routine counts
* the numbers of rows and columns. It's assumed that the number
* of entries is the same in each row, and an error is flagged if this
* is not the case.
*
*/
{
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;
    CHAR *delimiters = " ,\t\n";

    n_vars = n_samples = 0;

    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
    if (buffer == NULL)
        err_exit (__FILE__, __LINE__);

    /* Open price file - abort if filename invalid */
    f = fopen(file_name, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", file_name);
        err_exit (__FILE__, __LINE__);
    }

    /* Get number of entries in first row */
    if (fgets(buffer, buf_size, f) != NULL)
    {
        ++n_samples;
        ptr = strtok (buffer, delimiters);
        while (ptr != NULL)
        {
            ++n_vars;
            ptr = strtok (NULL, delimiters);
        }
    }

    /* Count numbers of subsequent rows */
    while (!feof(f))
    {
        if (fgets(buffer, buf_size, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */
            {
                ++n_samples;
                _width = 0;
                ptr = strtok (buffer, delimiters);
                while (ptr != NULL)
                {
                    ++_width;
                    ptr = strtok (NULL, delimiters);
                }

                if (n_vars != _width)
                {
                    printf("\n Number of entries in file %s did not agree", file_name);
                    err_exit (__FILE__, __LINE__);
                }
            }
        }
    }
   
    free (buffer);
}

/*-------------------------------------------------------------------*/

void free_matrix ( )
{
    UINT i;
    for (i=0; i<matrix->height; i++)
        free (matrix->data[i]);

    free (matrix->data);
    free (matrix);
}

/*-------------------------------------------------------------------*/

void free_tags ( )
{
    UINT i;
    for (i=0; i<n_vars; i++)
        free(tag_names[i]);
    free (tag_names);
}

/*-------------------------------------------------------------------*/

CHAR **read_tags (CHAR *tag_file)
{
    FILE *f;
    CHAR **tem_tag_names;
    UINT i;
    CHAR buffer[0xFF];
    CHAR *ptr;
    CHAR *delimiters = " ,\t\n";

    f = fopen(tag_file, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", tag_file);
        err_exit (__FILE__, __LINE__);
    }

    tem_tag_names = (CHAR**) malloc (n_vars * sizeof (CHAR*));
    for (i=0; i<n_vars; i++)
        tem_tag_names[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));

    i = 0;
    while (!feof(f))
    {   
        if (fgets(buffer, 0xFF, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))
            {
                if (i > n_vars-1)
                {
                    printf("\nMore variable names were detected than data items.");
                    printf("\nPlease correct this problem before proceeding");
                    exit(0);
                }
                //sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", tem_tag_names[i]);
                //i++;

                ptr = strtok (buffer, delimiters);
                while (ptr != NULL)
                {  
                    printf("%-15s", ptr);
                    strcpy(tem_tag_names[i], ptr);
                    i++;
                    ptr = strtok (NULL, delimiters);
                }
               
            }
        }
       
    }

    if (i<n_vars)
    {
        printf("\nFewer variable names than data items were detected.");
        printf("\nPlease correct this problem before proceeding");
        exit(0);
    }
    printf("\n\n");

    fclose (f);

    return tem_tag_names;
   
}

/*-------------------------------------------------------------------*/

void find_dif_vars (CHAR *file_name)
{
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;
    CHAR *delimiters = " ,\t\n";
    UINT i = 0;

    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
    if (buffer == NULL)
        err_exit (__FILE__, __LINE__);

    /* Open price file - abort if filename invalid */
    f = fopen(file_name, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", file_name);
        err_exit (__FILE__, __LINE__);
    }
   
    /* Get number of entries in first row */
    if (fgets(buffer, buf_size, f) != NULL)
    {
        i = 0;
        ptr = strtok (buffer, delimiters);
        while (ptr != NULL)
        {
            n_dif_vars[i] = 0;
            if( sizeof(dif_vars[i][n_dif_vars[i]]) < strlen(ptr) )
                err_exit(__FILE__, __LINE__);

            strcpy(dif_vars[i][n_dif_vars[i]], ptr);
            n_dif_vars[i] = 1;
            i++;
           
            ptr = strtok (NULL, delimiters);
        }
    }

    /* Count numbers of subsequent rows */
    while (!feof(f))
    {
        if (fgets(buffer, buf_size, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */
            {
                i = 0;
                ptr = strtok (buffer, delimiters);
                while (ptr != NULL)
                {
                    if( sizeof(dif_vars[i][n_dif_vars[i]]) < strlen(ptr) )
                        err_exit(__FILE__, __LINE__);

                    if( exsit(i, n_dif_vars[i], ptr) == false )
                    {
                        strcpy(dif_vars[i][n_dif_vars[i]], ptr);
                        n_dif_vars[i]++;
                    } 
                    i++;
                   
                    ptr = strtok (NULL, delimiters);
                }
            }
        }
    }

    free (buffer);

}  

/*-------------------------------------------------------------------*/

UINT *Allocate_n_dif_vars () {
    UINT *tem_n_dif_vars;
    tem_n_dif_vars = (UINT*) malloc (n_vars * sizeof (UINT));
    if (tem_n_dif_vars == NULL)
        err_exit(__FILE__, __LINE__);
    return tem_n_dif_vars;

/*-------------------------------------------------------------------*/

UINT *Allocate_used () {
    UINT *tem_used;
    tem_used = (UINT*) malloc (n_vars * sizeof (UINT));
    if (tem_used == NULL)
        err_exit(__FILE__, __LINE__);
    return tem_used;

/*-------------------------------------------------------------------*/

STRING **Allocate_dif_vars () {
    STRING **tem_dif_vars;
    UINT i;
   
    tem_dif_vars = (STRING**) malloc (n_vars * sizeof (STRING*));
   
    if (tem_dif_vars == NULL)
        err_exit(__FILE__, __LINE__);
       
    for (i=0; i<n_vars; i++)
    {
        tem_dif_vars[i] = (STRING*) malloc (MAX_N_DIF_VARS * sizeof(STRING));
        if (tem_dif_vars[i] == NULL)
            err_exit(__FILE__, __LINE__);
    }
    return tem_dif_vars;
}
 
/*-------------------------------------------------------------------*/

bool exsit (UINT m, UINT n, CHAR *ptr)
{
    UINT j;
    for(j=0; j<n; j++)
    {
        if( strcmp(dif_vars[m][j], ptr) == 0 )
            return true;
    }
    return false;
}       
   

 

 

 

 

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值