本文主要采用Logistic回归实现数据的分类
数据来源:UCI数据库 wpbc.data
可以参考相关资源进行学习:
http://www.cnblogs.com/jerrylead/archive/2011/03/05/1971867.html
Logistic回归主要针对输入的数据是连续的变量,输出则是有限的数值型。
涉及到以下方面:
1. 输出y = w0+w1*x1+w2*x2+..... (x1,x2,...是样本的属性值,为连续型的变量,w0,w1,w2,...为所要求的参数,y为有限的数值型变量,表示样本所属类别)。
2. logistic模型: 1/(1+exp(-z)),其中z= w0+w1*x1+w2*x2+..... 。
3.算法实现
w初始化为1;
alph = 0.1; //设置步长,需根据情况逐步调整
i = 0;
while( i<样本数量)
zi = w0+w1*xi1+w2*xi2+.....
h = 1/(1+exp(-zi));
error = yi-h;
for( j = 0; j <= 属性个数; ++j )   // 依次更新每个参数 wj(w0 对应 x0=1)
wj = wj+alph *error*xij; // j表示第j个属性
end
end
以上算法过程在样本量比较小的时候可以实现,在样本量非常大的时候,需要考虑采用随机梯度下降法,即随机从总的样本的选出小的样本集来用于迭代过程(可以百度相关资料)。
本文主要采用了梯度下降法完成了参数值优化过程。以下程序主要将第3节中的算法实现,主要包含main.h 和 main.cpp两个文件。
测试结果发现预测的准确率可以到80%左右。但感觉这和参数的调整有很大关系,样本量还是太小(总样本量198,训练集:150,测试集:48),这里比较简便,不包含校准数据集,另外结果存在一些欠拟合的现象。
main.cpp,具体用例
/*************
Logistic Regression( logistic 回归 ) using gradient descent
the Data:from UCI datalib named "wpbc.data"(that is about cancer )
CopyRight 2015/3/24 owner by pengjie(彭杰)
All Rights Reserved
**************/
#include "main.h"
// Train logistic-regression weights on the first trainNum samples of
// wpbc.data, print the learned weights, then report classification
// accuracy on the remaining (held-out) samples.
// Returns 0 on success, 1 when the data file cannot be read.
int main()
{
    // String literals are const in C++; a local array copy keeps
    // ReadData's non-const char* signature working without a cast.
    char file[] = "C:\\Users\\Administrator\\Desktop\\machine_learnning\\wpbc.data";
    DataSample *data = new DataSample[sampleNum];
    double *logisW = new double[attriNum+1];   // w0 (bias) + one weight per attribute
    if( -1==ReadData( data,file ) )
    {
        // No data: bail out instead of predicting on uninitialized
        // samples (the original fell through and evaluated garbage).
        delete []data;
        delete []logisW;
        return 1;
    }
    Logistic( data,logisW );
    // Dump the learned weights w0..w_attriNum.
    for(int i=0;i<(attriNum+1);++i)
    {
        printf("%f\t",logisW[i]);
    }
    printf("\n\n");
    // Evaluate on the held-out tail [trainNum, sampleNum).
    int correct = 0;
    int total = 0;
    for(int i=trainNum;i<sampleNum; ++i)
    {
        ++total;
        if( Predict(data[i],logisW) )
            ++correct;
    }
    // Guard against division by zero when there is no test set.
    double rp = (total>0) ? double(correct)/total : 0.0;
    printf("the right correction: %f\n",rp);
    delete []data;
    delete []logisW;
    return 0;
}
main.h完成所有功能实现
/*************
Logistic Regression( logistic 回归 ) using gradient descent
CopyRight 2015/3/24 owner by pengjie(彭杰)
All Rights Reserved
**************/
#ifndef MAIN_H
#define MAIN_H
#include "stdio.h"
#include "stdlib.h"
#include "iostream"
#include "string"
#include "string.h"
#include <sstream>
#include <memory.h>
#include "math.h"
using namespace std;
// Maximum number of distinct class labels supported.
// NOTE: the original `#define maxClassLabelNum 10;` carried a trailing
// semicolon INSIDE the macro, so any expansion in an expression would
// inject a stray `;` and fail to compile. A typed constant avoids that.
const int maxClassLabelNum = 10;
int curLabelNum = 0;        // number of distinct class labels seen so far
const double alph = 0.3;    // fixed step size (learning rate) for gradient descent
const int attriNum = 33;    // attribute columns per sample in wpbc.data
const int sampleNum = 198;  // total samples in wpbc.data
int trainNum = 140;         // first trainNum samples train; the rest are the test set
// One record from wpbc.data: the numeric attribute columns plus the
// binary diagnosis label (true when the label column starts with 'R',
// false otherwise — see ReadData). Field order is part of the in-memory
// layout; do not reorder.
struct DataSample
{
double attriValue[attriNum];  // attribute values; -1000 marks a missing ('?') entry
bool classLabel;              // R -> true(1), N -> false(0)
};
// Parse a decimal number from a C string.
// Returns 0.0 when src is NULL or contains no parsable number.
// (The original left a local double uninitialized and returned garbage
// when stringstream extraction failed.)
double StringTodouble(char * src)
{
    if( src==NULL )
        return 0.0;
    // strtod parses leading whitespace + number, exactly what the old
    // stringstream extraction did, but with a defined 0.0 on failure.
    return strtod(src, NULL);
}
// Load up to sampleNum comma-separated records from `file` into `data`.
// Record layout (wpbc.data): column 0 = sample ID (ignored), column 1 =
// class label ('R' -> 1, anything else -> 0), columns 2..attriNum+1 =
// numeric attributes, where '?' marks a missing value (stored as -1000).
// Returns 1 on success, -1 if the file cannot be opened.
int ReadData( DataSample* data, char *file)
{
    FILE *pFile = fopen(file,"rt");
    if(pFile==NULL)
    {
        printf("the data file is not existing: %s\n", file);
        return -1;
    }
    char buf[1024];
    char delim[] = ",";  // field delimiter
    int row = 0;
    // Loop on fgets directly: the original feof()-controlled loop
    // re-processed the last line after a failed read.
    while( row<sampleNum && fgets(buf,1024,pFile)!=NULL )
    {
        size_t len = strlen(buf);
        // Guard len>0: an empty read would index buf[-1] (UB).
        if( len>0 && buf[len-1]=='\n' )
        {
            buf[len-1] = '\0';
        }
        // Column 0: sample ID, unused.
        char *tmpdata = strtok(buf,delim);
        // Column 1: class label. The original dereferenced strtok's
        // result without a NULL check and crashed on malformed lines.
        tmpdata = strtok(NULL,delim);
        if( tmpdata==NULL )
        {
            continue;  // malformed/blank line: skip it
        }
        data[row].classLabel = (tmpdata[0]=='R');  // R:1; N:0
        // Columns 2..attriNum+1: attribute values.
        for( int column = 0; column<attriNum; ++column )
        {
            tmpdata = strtok(NULL,delim);
            if( tmpdata==NULL || tmpdata[0]=='?' )  // '?' means a missing value
                data[row].attriValue[column] = -1000;
            else
                data[row].attriValue[column] = StringTodouble(tmpdata);
        }
        ++row;
    }
    fclose(pFile);  // the original leaked the FILE handle
    return 1;
}
以上相关代码和使用的数据(来自UCI数据库)可以在我的资源里面下载:
http://download.csdn.net/detail/u200812705/8520817