逻辑回归糖尿病数据预测实战
逻辑回归
# Notebook setup: data/plotting libraries and matplotlib CJK font configuration.
import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']# use the SimHei font so Chinese labels render
plt.rcParams['axes.unicode_minus']=False# show minus signs correctly with a CJK font
import warnings
# Generate 200 synthetic samples with 2 features drawn from 2 Gaussian
# clusters (one per class); fixed random_state for reproducibility.
from sklearn.datasets import make_blobs
X,y=make_blobs(n_samples=200,n_features=2,centers=2,random_state=8)
print(X)
[[ 6.75445054 9.74531933]
[ 6.80526026 -0.2909292 ]
[ 7.07978644 7.81427747]
[ 6.87472003 -0.16069949]
[ 8.06164078 8.43736968]
[ 7.4934131 11.00892356]
[ 4.69777002 0.59687317]
[ 9.19642422 11.57536954]
[ 8.80996213 11.9021701 ]
[ 7.5952749 1.32739544]
[ 8.20330317 1.27929111]
[ 8.59258191 -0.29022607]
[ 6.89228905 8.60634293]]
…
(数据量较大,这里只显示部分)
print(y)
[0 1 0 1 0 0 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1
1 1 0 0 1 1 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0
0 0 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0
1 1 1 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1
0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 1
1 1 0 1 0 1 1 0 1 0 0 0 0 0 1]
# Visualise the two clusters, coloured by class label, with black point edges.
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')
<matplotlib.collections.PathCollection at 0x1df2b358da0>
plt.scatter(X[:,0],X[:,1],c=y)
<matplotlib.collections.PathCollection at 0x1df2b422be0>
梯度下降法实现逻辑回归
# Append a bias column of ones so the intercept folds into theta.
x_ones = np.ones((X.shape[0], 1))
X = np.column_stack((X, x_ones))
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8)
# Sanity-check the resulting dimensions.
print(X.shape, X_train.shape, X_test.shape)
(200, 3) (140, 3) (60, 3)
print(y.shape,y_train.shape,y_test.shape)
(200,) (140,) (60,)
# Reshape the targets into column vectors so they line up with X @ theta.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
print(y_train.shape, y_test.shape)
(140, 1) (60, 1)
# Start gradient descent from all-ones weights, one per column of X_train
# (the last weight multiplies the bias column).
theta = np.ones((X_train.shape[1], 1))
print(theta)
[[1.]
[1.]
[1.]]
# Gradient-descent learning rate.
alpha = 1e-3
def sigmoid(z):
    """Logistic (sigmoid) function: squash any real z into (0, 1)."""
    return 1.0 / (1 + np.exp(-z))
# Batch gradient descent on the logistic (cross-entropy) loss.
num_iters = 10000
# BUG FIX: m was hard-coded to 200, but the training split holds only 140
# rows (70% of 200); the gradient must be averaged over the actual sample count.
m = X_train.shape[0]
for i in range(num_iters):
    # Predicted probabilities for the current weights.
    h = sigmoid(np.dot(X_train, theta))
    # Standard logistic-regression gradient step: X^T (h - y) / m.
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m
print(theta)
[[ 0.58227542]
[-1.07778338]
[ 0.96527978]]
# Probability of class 1 for each test sample, thresholded at 0.5
# (probabilities exactly equal to 0.5 map to class 0, as before).
probs = sigmoid(np.dot(X_test, theta))
pred_y = np.where(probs > 0.5, 1.0, 0.0)
print(pred_y.reshape(1, -1))
[[0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0.
1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0.]]
print(y_test.reshape(1,-1))
[[0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0
1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1 0]]
print("预测准确率为:",np.sum(pred_y == y_test)/len(y_test))
预测准确率为: 1.0
糖尿病预测实战
#导入数据
# FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# float (float64) is the drop-in replacement. skiprows=1 skips the CSV header.
data = np.loadtxt(r"E:\大二下\机器学习实践\pima-indians-diabetes.data.csv",
                  delimiter=",", skiprows=1, dtype=float)
# Split the matrix into features (all but the last column) and label (last column).
X = data[:, :-1]
y = data[:, -1]
# Standardise each feature to zero mean and unit variance.
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mu) / std
# Append a bias column of ones so the intercept folds into theta.
x_ones = np.ones((X.shape[0], 1))
X = np.column_stack((X, x_ones))
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8)
# Targets as column vectors to match theta's shape.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
print(y_train.shape, y_test.shape)
(537, 1) (231, 1)
# Initialise every weight (including the bias) to 1.
theta = np.ones([X_train.shape[1], 1])
# Gradient-descent learning rate.
alpha = 0.001
# Logistic function: map any real z into (0, 1).
def sigmoid(z):
    s = 1.0 / (1 + np.exp(-z))
    return s
# Batch gradient descent on the cross-entropy loss.
num_iters = 10000
# BUG FIX: m was hard-coded to 200, but this training split has 537 rows
# (see the printed shapes above); average the gradient over the real count.
m = X_train.shape[0]
for i in range(num_iters):
    h = sigmoid(np.dot(X_train, theta))
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m
print(theta)
[[ 0.39210287]
[ 1.10657783]
[-0.24092243]
[ 0.0223229 ]
[-0.17137676]
[ 0.61819121]
[ 0.45880179]
[ 0.12971106]
[-0.84498429]]
# Probability of diabetes for each test sample, thresholded at 0.5
# (exactly 0.5 maps to class 0, matching the original <= comparison).
probs = sigmoid(np.dot(X_test, theta))
pred_y = np.where(probs > 0.5, 1.0, 0.0)
print(pred_y.reshape(1, -1))
[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]]
print(y_test.reshape(1,-1))
[[0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0.
1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1.
1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1.]]
print("预测准确率为:",np.sum(pred_y == y_test)/len(y_test))
预测准确率为: 0.7878787878787878
sklearn实现逻辑回归
Kaggle糖尿病预测
import numpy as np
import pandas as pd
#导入数据
# FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# float (float64) is the drop-in replacement. skiprows=1 skips the CSV header.
data = np.loadtxt(r"E:\大二下\机器学习实践\pima-indians-diabetes.data.csv",
                  delimiter=",", skiprows=1, dtype=float)
data
array([[ 6. , 148. , 72. , ..., 0.627, 50. , 1. ],
[ 1. , 85. , 66. , ..., 0.351, 31. , 0. ],
[ 8. , 183. , 64. , ..., 0.672, 32. , 1. ],
...,
[ 5. , 121. , 72. , ..., 0.245, 30. , 0. ],
[ 1. , 126. , 60. , ..., 0.349, 47. , 1. ],
[ 1. , 93. , 70. , ..., 0.315, 23. , 0. ]])
# Features = all columns but the last; label = last column.
X = data[:, :-1]
y = data[:, -1]
# Standardise each feature to zero mean and unit variance.
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mu) / std
# 70/30 train/test split with a fixed seed (no bias column needed here:
# sklearn's LogisticRegression fits its own intercept).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8)
#导入逻辑回归库
from sklearn.linear_model import LogisticRegression
# FIX: pin solver='liblinear' (the default in this sklearn version) explicitly.
# This keeps the fitted model identical while silencing the FutureWarning
# ("Default solver will be changed to 'lbfgs' in 0.22") seen in the output.
logist = LogisticRegression(solver='liblinear')
# Fit on the standardised training split.
logist.fit(X_train, y_train)
D:\anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
# Predict class labels (0/1) for the held-out test set.
y_predict = logist.predict(X_test)
print(y_predict)
[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]
# Accuracy = fraction of test samples predicted correctly.
print("预测准确率为:", np.mean(y_predict == y_test))
预测准确率为: 0.7792207792207793