#使用逻辑回归预测病马的死亡率
import numpy as np
import pandas as pd
# Training set: tab-separated text file read without a header row.
train = pd.read_csv(
    '../MLinAction_source/horseColicTraining.txt', sep='\t', header=None
)
# Test set: read the same way as the training file.
test = pd.read_csv(
    '../MLinAction_source/horseColicTest.txt', sep='\t', header=None
)
def sigmoid(inx):
    """Return the logistic sigmoid 1 / (1 + exp(-inx)), element-wise.

    The value is computed through the identity
    sigmoid(x) = (1 + tanh(x / 2)) / 2, so large-magnitude inputs saturate
    to 0 or 1 instead of overflowing inside np.exp and emitting a
    RuntimeWarning.

    Args:
        inx: a scalar or a NumPy array/matrix.

    Returns:
        The sigmoid of the input, with the same shape as the input.
    """
    return 0.5 * (1 + np.tanh(0.5 * inx))
# Logistic-regression classification function
def classify(inx, weights):
    """Predict the binary class label of a single sample.

    Both arguments are flattened to 1-D before the dot product, so the
    function accepts np.matrix rows as well as plain ndarrays or lists
    (for example a 1 x n row together with an n x 1 weight column).

    Args:
        inx: feature values of one sample (n values in total).
        weights: regression coefficients, one per feature (n values).

    Returns:
        0 if the predicted probability is below 0.5, otherwise 1.
    """
    features = np.asarray(inx, dtype=float).ravel()
    coefs = np.asarray(weights, dtype=float).ravel()
    p = sigmoid(np.dot(features, coefs))
    return 0 if p < 0.5 else 1
def SGD_LR(dataSet, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights by stochastic gradient descent.

    maxCycles rows are drawn from dataSet at random with replacement, the
    feature columns of that sample are standardized with regularize, and one
    weight update is performed per drawn row.

    Args:
        dataSet: pandas DataFrame; every column except the last is a
            feature and the last column is the label (expected to be 0/1,
            since classify predicts 0 or 1).
        alpha: learning rate (step size) of each update.
        maxCycles: number of rows sampled, i.e. number of updates performed.

    Returns:
        An (n, 1) column of weights, one per feature. No intercept term is
        fitted.
    """
    sampled = dataSet.sample(maxCycles, replace=True)
    # np.asmatrix instead of the np.mat alias, which NumPy 2.0 removed.
    xMat = regularize(np.asmatrix(sampled.iloc[:, :-1].values))
    yMat = np.asmatrix(sampled.iloc[:, -1].values).T
    m, n = xMat.shape
    weights = np.zeros((n, 1))
    for i in range(m):
        # Logistic-loss gradient for one sample: x^T * (sigmoid(x w) - y).
        # Without the sigmoid this would be the linear-regression gradient.
        error = sigmoid(xMat[i] * weights) - yMat[i]
        weights = weights - alpha * (xMat[i].T * error)
    return weights
# Standardization (z-score)
def regularize(xMat):
    """Standardize every column to zero mean and unit standard deviation.

    Args:
        xMat: 2-D NumPy array or matrix with samples in rows and features
            in columns. The input is not modified.

    Returns:
        A new array/matrix of the same shape holding
        (value - column mean) / column standard deviation.
        Columns whose standard deviation is 0 come back as all zeros.
    """
    col_means = np.mean(xMat, axis=0)  # axis=0: one statistic per column
    col_stds = np.std(xMat, axis=0)
    # A constant column has std 0; dividing by it would fill the column with
    # NaN. Dividing by 1 instead leaves the (already zero) centered values.
    col_stds = np.where(col_stds == 0, 1.0, col_stds)
    return (xMat - col_means) / col_stds
# Logistic classification model
def get_acc(train, test, alpha=0.001, maxCycles=5000):
    """Train on train, predict a label for every row of test, print accuracy.

    Args:
        train: training DataFrame; the last column is the label.
        test: test DataFrame with the same column layout.
        alpha: learning rate passed to SGD_LR.
        maxCycles: number of SGD updates passed to SGD_LR.

    Returns:
        A copy of test with an extra 'predict' column holding the
        predicted labels.
    """
    weights = SGD_LR(train, alpha=alpha, maxCycles=maxCycles)
    # np.asmatrix instead of the np.mat alias, which NumPy 2.0 removed.
    # Iterating the matrix yields one 1 x n row per sample.
    # NOTE(review): the test features are standardized with their own
    # mean/std rather than the training statistics - confirm this is intended.
    xMat = regularize(np.asmatrix(test.iloc[:, :-1].values))
    retest = test.copy()
    retest['predict'] = [classify(inx, weights) for inx in xMat]
    # Last column is the prediction, second to last is the true label.
    acc = (retest.iloc[:, -1] == retest.iloc[:, -2]).mean()
    print(f'模型准确率:{acc}')
    return retest
# Hyper-parameters shared by every run below.
alpha = 0.001
maxCycles = 5000
get_acc(train, test, alpha, maxCycles)
# Run ten more times to inspect the results; SGD_LR samples its training
# rows at random, so each run can report a different accuracy.
# get_acc returns the labelled DataFrame (not the accuracy), and it is not
# needed here, so the return value is discarded.
for _ in range(10):
    get_acc(train, test, alpha=alpha, maxCycles=maxCycles)
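# horseColicTraining.txt:
# download link (URL not included in this copy)
# extraction code: 4to1
# horseColicTest.txt:
# download link (URL not included in this copy)
# extraction code: t3kb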