前一篇文章说了如何用c语言读取CSV文件并保存为二维数组 前 一 篇 文 章 说 了 如 何 用 c 语 言 读 取 C S V 文 件 并 保 存 为 二 维 数 组 ,这一篇文章将会在将对得到二维数组根据某一个值进行划分操作
CSV文件
x1,x2,x3,x4,y
3.6216,8.6661,-2.8073,-0.44699,0
4.5459,8.1674,-2.4586,-1.4621,0
3.866,-2.6383,1.9242,0.10645,0
3.4566,9.5228,-4.0112,-3.5944,1
0.32924,-4.4552,4.5718,-0.9888,1
4.3684,9.6718,-3.9606,-3.1625,1
以 x2=8.1674 x 2 = 8.1674 为界限,把二维数组划分为两部分,对于二维数组中的每一行来说 x2 x 2 的值大于 8.1674 8.1674 的存储在一个二维数组里面,小于 8.1674 8.1674 的每一行存储在一个二维数组里面
思路很简单,就是利用循环来进行划分操作
先来一个Python版本的伪代码,比较直观:
def split(index, value, dataset):
left, right = list(), list()
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
c语言里面没有 append() a p p e n d ( ) 操作,只能利用双重循环来进行赋值,并且我们要利用两个链表来记录所在行 x2 x 2 的值小于和大于 8.1674 8.1674 值所在行的行下标
完整的c语言代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct node{
int data;
struct node *next;
};//链表的定义
void get_two_dimension(char* line, double** data, char *filename);
void print_two_dimension(double** data, int row, int col);
int get_row(char *filename);
int get_col(char *filename);
int Gini(int** train_data, int* target);
void split_data(double **data, int index, int value, int row, int col);
int main()
{
char filename[] = "C:\\Users\\...\\...\\csvtest.csv";
char line[1024];
double **data;
int row, col;
int index = 1;
double value = 8.1674;
row = get_row(filename);
col = get_col(filename);
data = (double **)malloc(row * sizeof(int *));
for (int i = 0; i < row; ++i){
data[i] = (double *)malloc(col * sizeof(double));
}
get_two_dimension(line, data, filename);
printf("row = %d\n", row);
printf("col = %d\n", col);
split_data(data, index, value, row, col);
}
void get_two_dimension(char* line, double** data, char *filename)
{
FILE* stream = fopen(filename, "r");
int i = 0;
while (fgets(line, 1024, stream))
{
int j = 0;
char *tok;
char* tmp = strdup(line);
for (tok = strtok(line, ","); tok && *tok; j++, tok = strtok(NULL, ",\n")){
data[i][j] = atof(tok);
}
i++;
free(tmp);
}
fclose(stream);
}
void print_two_dimension(double** data, int row, int col)
{
int i, j;
for(i=1; i<row; i++){
for(j=0; j<col; j++){
printf("%f\t", data[i][j]);
}
printf("\n");
}
}
int get_row(char *filename)
{
char line[1024];
int i;
FILE* stream = fopen(filename, "r");
while(fgets(line, 1024, stream)){
i++;
}
fclose(stream);
return i;
}
int get_col(char *filename)
{
char line[1024];
int i = 0;
FILE* stream = fopen(filename, "r");
fgets(line, 1024, stream);
char* token = strtok(line, ",");
while(token){
token = strtok(NULL, ",");
i++;
}
fclose(stream);
return i;
}
void split_data(double **data, int index, int value, int row, int col)
{
node *left_node, *right_node;//left_node记录小于x2的行下标,right_node记录大于x2的行下标
left_node = (node *)malloc(sizeof(node));
right_node = (node *)malloc(sizeof(node));
left_node->next = NULL;
right_node->next = NULL;
int count_left_size=0, i, count_right_size=0, j, k;
for(i=1; i<row; i++){
if(data[i][index] < value)
count_left_size++;
else
count_right_size++;
}//计算两个二维数组的长度
printf("left size = %d\n", count_left_size);
printf("right size = %d\n", count_right_size);
double left_array[count_left_size][col], right_array[count_right_size][col];
for(i=1; i<row; i++){
if(data[i][index] < value){
node *p;
p = (node *)malloc(sizeof(node));
p->data = i;
p->next = left_node->next;
left_node->next = p;
}//对所在行x2的值小于8.1674的行下标进行链接
else{
node *q;
q = (node *)malloc(sizeof(node));
q->data = i;
q->next = right_node->next;
right_node->next = q;
}//对所在行x2的值大于8.1674的行下标进行链接
}
left_node = left_node->next;//链表的头部不存储行下标,往后移一位
right_node = right_node->next;//链表的头部不存储行下标,往后移一位
i = 0;
while(left_node){
for(j=0; j<col; j++){
left_array[i][j] = data[left_node->data][j];
}//赋值操作
left_node = left_node->next;
i++;
}
i = 0;
while(right_node){
for(j=0; j<col; j++){
right_array[i][j] = data[right_node->data][j];
}//赋值操作
right_node = right_node->next;
i++;
}
for(i=0; i<count_left_size; i++){
printf("I'm left\n");
for(j=0; j<col; j++){
printf("%f\t", left_array[i][j]);
}
printf("\n");
}
for(i=0; i<count_right_size; i++){
printf("I'm right\n");
for(j=0; j<col; j++){
printf("%f\t", right_array[i][j]);
}
printf("\n");
}
}
运行结果
row = 7
col = 5
left size = 2
right size = 4
I'm left
0.329240 -4.455200 4.571800 -0.988800 1.000000
I'm left
3.866000 -2.638300 1.924200 0.106450 0.000000
I'm right
4.368400 9.671800 -3.960600 -3.162500 1.000000
I'm right
3.456600 9.522800 -4.011200 -3.594400 1.000000
I'm right
4.545900 8.167400 -2.458600 -1.462100 0.000000
I'm right
3.621600 8.666100 -2.807300 -0.446990 0.000000
可以直观的感觉到Python代码是多么的简洁,不过在用c语言实现时用到了单链表,终于感觉单链表没白学了。