数据来源
http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic
处理过程
由于该数据集约有30%的数据缺失,首先必须对数据集进行预处理:这里我把缺失值用每列的平均值来代替,同时把数据集中没用的几列数据舍弃,并对各列数据做Max-Min归一化。之后利用sklearn库进行Logistic回归。
代码与数据已经打包上传,如果有需要请移步:
http://download.csdn.net/detail/qq_30091945/9822726
结果:
由于约有30%的数据缺失,预测误差不可避免地偏大。
Python代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/4/23 0023 7:59
# @Author : Aries
# @Site :
# @File : 疝气症预测病马死亡率.py
# @Software: PyCharm Community Edition
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
def GetData(path):
    """
    Load a horse-colic data file into a feature matrix and a label vector.

    Every non-blank line is treated as one record of whitespace-separated
    fields. Fields equal to '?' are missing values.

    :param path: path of the data set file
    :return: tuple ``(Data, Label)`` of numpy arrays. ``Data`` holds the kept
        feature columns as floats, with missing values stored as 0.0 so they
        can be imputed later. ``Label`` is 1 when field 22 equals 1 (lived)
        and 0 otherwise (died, euthanized, or missing).
    """
    # Indices of the attributes that are not used as features.
    index = [2, 24, 25, 26, 27]
    Data = []
    Label = []
    with open(path) as f:
        for line in f:
            # split() without an argument collapses repeated blanks and tabs,
            # so stray whitespace never produces empty fields.
            LineArr = line.split()
            if not LineArr:
                # Blank line (e.g. trailing newline at end of file): skip it.
                continue
            data = []
            for i, field in enumerate(LineArr):
                if i in index:
                    # Unused attribute.
                    continue
                if i == 22:
                    # Field 22 is the outcome: 1 = lived -> label 1;
                    # 2 / 3 (died / euthanized) or missing -> label 0.
                    Label.append(1 if field != '?' and int(field) == 1 else 0)
                else:
                    # Missing feature values are recorded as 0.0 for now.
                    data.append(0.0 if field == '?' else float(field))
            Data.append(data)
    return np.array(Data), np.array(Label)
def ZeroProcess(data):
    """
    Impute the zero placeholders of each column with that column's mean.

    A value of 0 marks a missing entry (GetData stores '?' as 0.0). For every
    column, the zeros are replaced by the mean of the column's non-zero
    entries. A column that contains no zeros, or only zeros, is left as is.

    :param data: 2-D numpy array to process; it is modified in place
    :return: the same array object, with the zero placeholders filled in
    """
    m, n = np.shape(data)
    for i in range(n):
        column = data[:, i]  # a view, so assignments write through to data
        missing = column == 0
        # Impute only when there is something to fill and something to
        # average; an all-zero column has no observed values to use.
        if missing.any() and not missing.all():
            column[missing] = column[~missing].mean()
    return data
def autoNorm(Data):
    """
    Rescale every column to the [0, 1] range (Max-Min normalization).

    :param Data: 2-D numpy array whose columns are to be normalized
    :return: new array where each column is ``(x - min) / (max - min)``;
        a constant column (max == min) maps to all zeros instead of NaN
    """
    # Per-column minimum and value range.
    data_min = Data.min(0)
    data_range = Data.max(0) - data_min
    # A constant column has range 0; dividing by 1 instead keeps the result
    # finite (the numerator is already 0 for every row of such a column).
    safe_range = np.where(data_range == 0, 1, data_range)
    # Broadcasting applies the per-column vectors to every row.
    return (Data - data_min) / safe_range
def PreProcess(data):
    """
    Run the full preprocessing pipeline on a data matrix.

    The data first goes through ZeroProcess (zero-value handling) and the
    result is then passed to autoNorm (Max-Min normalization).

    :param data: the data to preprocess
    :return: the processed data
    """
    return autoNorm(ZeroProcess(data))
def run_main():
    """
    Entry point: train a Logistic regression model and report its errors.

    Loads the training and test files, preprocesses both, fits a
    LogisticRegression on the training set, then prints the fitted
    coefficients, the training misclassification rate and R2, every
    (actual, predicted) pair of the test set, and the test
    misclassification rate.
    """
    # Load the training and test data sets.
    train_x, train_y = GetData("./horse_colic_train.txt")
    test_x, test_y = GetData("./horse_colic_test.txt")

    # Zero-value handling and normalization.
    # NOTE(review): each set is normalized with its own min/max, so the test
    # features are not on the training scale - confirm this is intended.
    train_x_ready = PreProcess(train_x)
    test_x_ready = PreProcess(test_x)

    # Configure matplotlib so that it can render Chinese text.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Fit the Logistic regression model.
    classifier = LogisticRegression()
    classifier.fit(train_x_ready, train_y)
    print("Logistic回归的系数为:", classifier.coef_, "常数项为:", classifier.intercept_)

    # Evaluate on the training set: misclassification rate and R2.
    train_pred = classifier.predict(train_x_ready)
    train_error = float(np.count_nonzero(train_y != train_pred)) / len(train_y)
    label_mean = np.average(train_y)
    total_ss = np.sum((train_y - label_mean) ** 2)
    residual_ss = np.sum((train_pred - train_y) ** 2)
    r_squared = 1 - residual_ss / total_ss
    print("训练样本的误差为:", train_error)
    print("R2为:", r_squared)

    # Evaluate on the held-out test set.
    test_pred = classifier.predict(test_x_ready)
    for actual, predicted in zip(test_y, test_pred):
        print(actual, predicted)
    test_error = float(np.count_nonzero(test_pred != test_y)) / len(test_y)
    print("误差为:", test_error)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_main()