Logistic Regression in Practice

This post walks through logistic regression in Python: a from-scratch gradient descent implementation (data preparation, training, prediction), a hands-on prediction task on the Pima Indians diabetes dataset, the equivalent sklearn implementation, and the different strategies sklearn offers for multi-class problems.
# Generate 200 binary-classification samples (2 features)
from sklearn.datasets import make_blobs
X,y = make_blobs(n_samples=200,n_features=2,centers=2,random_state=8)
print(X)

Partial output (omitted): an excerpt of the 200×2 feature array.

# Visualize the data
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')

The resulting scatter plot (omitted) shows the two classes as two clearly separated clusters.

Implementing logistic regression with gradient descent
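
For reference, the training loop below implements the standard batch gradient descent update for logistic regression (the formula is implied by the code rather than stated explicitly in the original):

$$\sigma(z) = \frac{1}{1+e^{-z}}, \qquad \theta \leftarrow \theta - \frac{\alpha}{m}\,X^{\top}\big(\sigma(X\theta) - y\big)$$

where $m$ is the number of training samples (140 here) and the all-ones column appended below makes the last entry of $\theta$ act as the intercept.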

# Append a column of ones (the intercept term)
import numpy as np
x_ones = np.ones((X.shape[0],1))
X = np.hstack((X,x_ones))
print(X)

Partial output (omitted): the 200×3 array with a trailing all-ones column.

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=8)
# Check the shapes
print(X.shape,X_train.shape,X_test.shape)
print(y.shape,y_train.shape,y_test.shape)

Output:

(200, 3) (140, 3) (60, 3)
(200,) (140,) (60,)
# Reshape the targets into column vectors so (h - y_train) broadcasts correctly
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
print(y_train.shape,y_test.shape)

Output:

(140, 1) (60, 1)
# Initialize theta (n+1 parameters, including the intercept)
theta = np.ones([X_train.shape[1],1])
# Set the learning rate (step size)
alpha = 0.001
# Define the sigmoid function
def sigmoid(z):
    s = 1.0 / (1+np.exp(-z))
    return s
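# Note (my addition): for large negative z, np.exp(-z) can overflow and
# trigger a RuntimeWarning; scipy.special.expit is a numerically stable
# drop-in replacement if that becomes a problem.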
num_iters = 10000
for i in range(num_iters):
    # Predicted probabilities for the current theta
    h = sigmoid(np.dot(X_train,theta))
    # Batch gradient descent step; 140 is the number of training samples
    theta = theta - alpha*np.dot(X_train.T,(h-y_train))/140
print(theta)

Output:

[[ 0.65443683]
 [-1.1828222 ]
 [ 0.97980398]]
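
The loop runs a fixed 10,000 iterations with no convergence check. A minimal way to watch training progress (my addition, not in the original post) is to print the cross-entropy loss every so often:

# Sketch: monitor the cross-entropy loss during training
theta = np.ones([X_train.shape[1],1])
for i in range(num_iters):
    h = sigmoid(np.dot(X_train,theta))
    theta = theta - alpha*np.dot(X_train.T,(h-y_train))/140
    if i % 1000 == 0:
        eps = 1e-12  # guard against log(0)
        loss = -np.mean(y_train*np.log(h+eps) + (1-y_train)*np.log(1-h+eps))
        print("iter", i, "loss", loss)
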
# Predict probabilities on the test set
pred_y = sigmoid(np.dot(X_test,theta))
pred_y

Partial output (omitted): 60 predicted probabilities between 0 and 1.

# Binarize predictions at the 0.5 threshold
pred_y[pred_y>0.5] = 1
pred_y[pred_y<=0.5] = 0
print(pred_y.reshape(1,-1))

Output:

[[0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0.
  1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0.]]
print(y_test.reshape(1,-1))

Output:

[[0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0
  1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1 0]]
print("预测准确率为:",np.sum(pred_y == y_test)/len(y_test))

Output:

Prediction accuracy: 1.0

The perfect score is expected: as the scatter plot showed, the two generated clusters are linearly separable.

Hands-on: Kaggle diabetes prediction

# Load the data
import pandas as pd
data = pd.read_csv("pima-indians-diabetes.data.csv")
data

Partial output (omitted): the 768×9 DataFrame (8 features plus the outcome column).

# Separate the features and the target
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
# Standardize the features
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X-mu)/std
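# Note (my addition): strictly, mu and std should be computed on the
# training split only and then applied to the test split; computing them
# on the full dataset, as above, leaks test-set statistics into preprocessing.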
# Append a column of ones (the intercept term)
x_ones = np.ones((X.shape[0],1))
X = np.hstack((X,x_ones))
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=8)
# Check the shapes
print(X.shape,X_train.shape,X_test.shape)

Output:

(768, 9) (537, 9) (231, 9)
print(y.shape,y_train.shape,y_test.shape)

Output:

(768,) (537,) (231,)
# Reshape the targets into column vectors
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)
print(y_train.shape,y_test.shape)

Output:

(537, 1) (231, 1)
# Initialize theta
theta = np.ones([X_train.shape[1],1])
# Set the learning rate
alpha = 0.001
# Define the sigmoid function (same as above)
def sigmoid(z):
    s = 1.0 / (1+np.exp(-z))
    return s
num_iters = 10000
for i in range(num_iters):
    h = sigmoid(np.dot(X_train,theta))
    # 537 is the number of training samples in this split
    theta = theta - alpha*np.dot(X_train.T,(h-y_train))/537
print(theta)

Output:

[[ 0.50767366]
 [ 0.98238212]
 [-0.06812689]
 [ 0.09832227]
 [ 0.13580986]
 [ 0.63046909]
 [ 0.63007112]
 [ 0.32469932]
 [-0.47604037]]
# Predict probabilities on the test set
pred_y = sigmoid(np.dot(X_test,theta))
pred_y

Partial output (omitted): 231 predicted probabilities.

# Binarize predictions at the 0.5 threshold
pred_y[pred_y>0.5] = 1
pred_y[pred_y<=0.5] = 0
print(pred_y.reshape(1,-1))

Output:

[[0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1.
  0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.
  0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1.
  0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0.
  1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0.
  0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
  0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1.]]
print(y_test.reshape(1,-1))

Output:

[[0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 1
  0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1
  0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0
  0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0 0
  1 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0
  0 0 0 1 1 0 0 0 1 0 1 0 1 0 1]]
print("预测准确率为:",np.sum(pred_y == y_test)/len(y_test))

Output:

Prediction accuracy: 0.7619047619047619

Logistic regression with sklearn

# Load the data
import pandas as pd
data = pd.read_csv("pima-indians-diabetes.data.csv")
# Separate the features and the target
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
# Standardize the features
mu = X.mean(axis=0)
std = X.std(axis=0)
X = (X-mu)/std
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=8)
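
The original section stops here without actually fitting a model. A minimal continuation, following the same three-step pattern used in the next section, would be:

# Sketch (not in the original post): fit and score sklearn's LogisticRegression
from sklearn.linear_model import LogisticRegression
logis = LogisticRegression()
logis.fit(X_train,y_train)
print(logis.score(X_test,y_test))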

Three-class classification with logistic regression

# Load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
# Separate the features and the target
X = iris.data
y = iris.target
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=8)
# Import the logistic regression class
from sklearn.linear_model import LogisticRegression
# The usual three steps: instantiate, fit, score
logis = LogisticRegression()
logis.fit(X_train,y_train)
# Evaluate the model
logis.score(X_test,y_test)

Output:

0.9210526315789473
from sklearn.metrics import classification_report
print(classification_report(y_test,logis.predict(X_test)))

Output (omitted): precision, recall, F1-score, and support for each of the three iris classes.

# Equivalent to passing multi_class='multinomial', solver='lbfgs' explicitly
logis2 = LogisticRegression(multi_class='multinomial',solver='lbfgs')
logis2.fit(X_train,y_train)
logis2.score(X_test,y_test)

Output:

0.9210526315789473
# With multi_class='ovr' (one-vs-rest) instead
logis3 = LogisticRegression(multi_class='ovr',solver='lbfgs')
logis3.fit(X_train,y_train)
logis3.score(X_test,y_test)

Output:

0.8157894736842105
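
The gap makes sense: 'ovr' trains one binary classifier per class and picks the class with the highest score, while 'multinomial' fits a single softmax model over all three classes jointly, which works better on this iris split. A quick way to see the difference (a sketch; the first three test rows are an arbitrary choice) is to compare the per-class probabilities of the two fitted models:

# Sketch: compare per-class probabilities of the two strategies
print(logis2.predict_proba(X_test[:3]))  # softmax (multinomial) probabilities
print(logis3.predict_proba(X_test[:3]))  # normalized one-vs-rest scores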