逻辑回归的Python实现

最新推荐文章于 2024-11-29 15:54:47 发布

godli_one

最新推荐文章于 2024-11-29 15:54:47 发布

阅读量631

点赞数

本文链接：https://blog.csdn.net/qq_24059779/article/details/105621021

版权

逻辑回归的Python实现

利用Python中sklearn包进行逻辑回归分析。

3.1提出问题
根据已有数据探究“学习时长”与“是否通过考试”之间关系，并建立预测模型。

3.2理解数据

1、导入包和数据

#1.导入包
import warnings
import pandas as pd
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# 2. Build the dataset: hours studied vs. exam outcome (0 = failed, 1 = passed).
study_hours = list(np.arange(0.50, 5.50, 0.25))  # 0.50, 0.75, ..., 5.25
exam_outcomes = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
]
dataDict = {'学习时间': study_hours, '考试成绩': exam_outcomes}
# Wrap in an OrderedDict so the column order is fixed, then build the frame.
dataOrDict = OrderedDict(dataDict)
dataDf = pd.DataFrame(dataOrDict)
# Preview the first rows (notebook-style display).
dataDf.head()

>>>
       学习时间	考试成绩
0	0.50	0
1	0.75	0
2	1.00	0
3	1.25	0
4	1.50	0

2、查看数据

# Preview the first rows of the dataset to see its concrete layout
dataDf.head()
# Print column dtypes and non-null counts to check for missing values
dataDf.info()
>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
学习时间    20 non-null float64
考试成绩    20 non-null int64
dtypes: float64(1), int64(1)
memory usage: 400.0 bytes

# Show descriptive statistics (count, mean, std, min, quartiles, max) per column
dataDf.describe()
>>>
        学习时间	 考试成绩
count	20.00000 20.000000
mean	2.87500	0.500000
std	1.47902	0.512989
min	0.50000	0.000000
25%	1.68750	0.000000
50%	2.87500	0.500000
75%	4.06250	1.000000
max	5.25000	1.000000

3、绘制散点图查看数据分布情况

# Separate the feature column (study time) from the label column (exam result).
exam_X = dataDf.loc[:, '学习时间']
exam_y = dataDf.loc[:, '考试成绩']

# Scatter plot of the raw observations to inspect how the two classes
# are distributed along the study-time axis.
plt.scatter(exam_X, exam_y, color='b', label='考试数据')
plt.xlabel('学习时间')
plt.ylabel('考试成绩')
plt.legend(loc='upper left')  # same position as the numeric code 2
plt.show()

3.3构建模型

1、拆分训练集和测试集并利用散点图观察

# 1. Split the data into a training set and a test set.
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# scikit-learn expects a 2-D feature matrix, so reshape to (n_samples, 1).
# np.asarray accepts both a pandas Series and an ndarray, which keeps this
# cell safe to re-run after exam_X / exam_y have already been converted.
exam_X = np.asarray(exam_X).reshape(-1, 1)
exam_y = np.asarray(exam_y).reshape(-1, 1)

# 80% of the samples go to training, the remaining 20% to testing.
train_X, test_X, train_y, test_y = train_test_split(exam_X, exam_y, train_size=0.8)
print('训练集数据大小为', train_X.size, train_y.size)
print('测试集数据大小为', test_X.size, test_y.size)
>>>
训练集数据大小为 16 16
测试集数据大小为 4 4

# 2. Visual check of the split: training points in blue, test points in red.
for features, labels, colour, tag in (
    (train_X, train_y, 'b', 'train data'),
    (test_X, test_y, 'r', 'test data'),
):
    plt.scatter(features, labels, color=colour, label=tag)
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.legend(loc='upper left')  # same position as the numeric code 2
plt.show()

2、导入模型

# 3. Import the model class and create an estimator with default settings.
# (The snippet was pasted five times in a row; one copy is sufficient.)
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression()

3、训练模型

# 4. Train the model.
# train_y was reshaped to a column vector (n, 1) during the split; fit()
# expects 1-D labels, so flatten it to avoid a DataConversionWarning.
modelLR.fit(train_X, train_y.ravel())

3.4模型评估

1、模型评分（即准确率）

# Mean accuracy of the fitted model on the held-out test set
modelLR.score(test_X,test_y)
>>>
0.75

2、指定某个点的预测情况

# For a given study time (3 hours here), what are the predicted
# probabilities of class 0 and class 1?
# The input must be 2-D (n_samples, n_features); a bare scalar is rejected
# by current scikit-learn versions.
modelLR.predict_proba([[3]])
>>>
array([[0.36720478, 0.63279522]])

# For a given study time (3 hours here), is the student predicted to pass?
# The input must be 2-D (n_samples, n_features); a bare scalar is rejected
# by current scikit-learn versions.
modelLR.predict([[3]])
>>>
array([1])

3、求出逻辑回归函数并绘制曲线
逻辑回归函数

# First obtain the linear part y = a + b*x, then plug it into the logistic
# function: pred_y = 1 / (1 + np.exp(-y)).
b = modelLR.coef_        # shape (1, n_features) -> (1, 1) for a single feature
a = modelLR.intercept_   # shape (1,)
# Index out the scalars explicitly: formatting a size-1 ndarray with %f relies
# on implicit array-to-scalar conversion, which NumPy has deprecated.
print('该模型对应的回归函数为:1/(1+exp-(%f+%f*x))' % (a[0], b[0, 0]))
>>>
该模型对应的回归函数为:1/(1+exp-(-1.527106+0.690444*x))

逻辑回归曲线

# Draw the fitted logistic curve on top of the training / test points.
plt.scatter(train_X, train_y, color='b', label='train data')
plt.scatter(test_X, test_y, color='r', label='test data')
# test_X comes out of train_test_split in shuffled order; plt.plot connects
# points in the order given, so sort first to get a curve rather than
# criss-crossing chords.
sorted_test_X = np.sort(test_X, axis=0)
plt.plot(sorted_test_X, 1 / (1 + np.exp(-(a + b * sorted_test_X))), color='r')
# Curve over the full range of study times.
plt.plot(exam_X, 1 / (1 + np.exp(-(a + b * exam_X))), color='y')
plt.legend(loc=2)
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.show()

4、得到模型混淆矩阵

from sklearn.metrics import confusion_matrix
# Turn the predicted probabilities for the test set into hard 0/1 labels
# by rounding.
pred_y = 1 / (1 + np.exp(-(a + b * test_X)))
pred_y = pd.DataFrame(pred_y).round(0).astype(int)
# Confusion matrix (rows = true class, columns = predicted class).
# Pass flat integer arrays and pin labels to [0, 1]: with a test set this
# small, one class may be absent, and the matrix would otherwise shrink
# below 2x2.
confusion_matrix(test_y.ravel(), pred_y.values.ravel(), labels=[0, 1])
>>>
array([[1, 1],
       [0, 2]])

从混淆矩阵可以看出：