Logistic Regression in Practice: The College Admissions Problem

Notes on using logistic regression to predict whether a student is admitted.

Model Analysis

import pandas as pd
import matplotlib.pyplot as plt
admissions = pd.read_csv("admissions.csv")
print(admissions.head())
plt.scatter(admissions['gpa'], admissions['admit'])
plt.show()

[Figure: scatter plot of admit (0/1) against gpa]
The raw data has two features, gpa and gre, and one label, admit.

#from sklearn.linear_model import LinearRegression
#linear_model = LinearRegression()
#linear_model.fit(admissions[["gpa"]], admissions["admit"])
#(linear regression would follow the same workflow as logistic regression)
from sklearn.linear_model import LogisticRegression  # import the class
logistic_model = LogisticRegression()  # instantiate the model
logistic_model.fit(admissions[["gpa"]], admissions["admit"])  # fit it on the data set
# Compute the probability of admission from gpa
pred_probs = logistic_model.predict_proba(admissions[["gpa"]])  # store the result so we can plot it
plt.scatter(admissions["gpa"], pred_probs[:, 1])  # column 1: probability of being admitted
plt.show()

[Figure: scatter plot of predicted admission probability against gpa]
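predict_proba returns one column per class: column 0 is P(admit = 0) and column 1 is P(admit = 1), so each row sums to 1. A quick sanity-check sketch on the pred_probs array from above:

print(pred_probs[:5])  # first few rows of class probabilities
print(pred_probs.sum(axis=1)[:5])  # each row sums to 1.0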

# Classify admit as 0/1 from gpa
fitted_labels = logistic_model.predict(admissions[["gpa"]])
plt.scatter(admissions["gpa"], fitted_labels)
plt.show()

[Figure: scatter plot of predicted 0/1 labels against gpa]
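For binary classification, predict() amounts to thresholding the class-1 probability at 0.5 (equivalently, taking the larger of the two predict_proba columns). A minimal sketch to confirm this on our data:

import numpy as np

manual_labels = (pred_probs[:, 1] >= 0.5).astype(int)  # threshold P(admit = 1) at 0.5
print(np.array_equal(manual_labels, fitted_labels))  # expect True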

That wraps up the analysis. To finish, here is an implementation of the important sigmoid function, handy for review:

import numpy as np

# Sigmoid (logistic) function: sigma(x) = e^x / (1 + e^x) = 1 / (1 + e^(-x))
def sigmoid(x):
    # np.exp(x) computes e^x, where e ~= 2.71828
    return np.exp(x) / (1 + np.exp(x))

# Generate 50 real values, evenly spaced, between -6 and 6.
x = np.linspace(-6, 6, 50, dtype=float)

# Transform each number in x using the sigmoid function.
y = sigmoid(x)

# Plot the resulting data.
plt.plot(x, y)
plt.ylabel("Probability")
plt.show()

[Figure: the sigmoid curve for x in [-6, 6]]
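The fitted model is exactly this curve applied to a linear function of gpa. A minimal sketch (assuming logistic_model from earlier is still in scope) checking that predict_proba matches sigmoid(intercept + coef * gpa):

import numpy as np

z = logistic_model.intercept_[0] + logistic_model.coef_[0][0] * admissions["gpa"]
manual_probs = 1 / (1 + np.exp(-z))
print(np.allclose(manual_probs, logistic_model.predict_proba(admissions[["gpa"]])[:, 1]))  # expect True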

Model Evaluation

Analyzing the data set alone is not enough; we also need to see how well the model actually performs.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

admissions = pd.read_csv("admissions.csv")
model = LogisticRegression()
model.fit(admissions[["gpa"]], admissions["admit"])

labels = model.predict(admissions[["gpa"]])  # predicted labels
admissions["predicted_label"] = labels  # append them as a new column
print(admissions["predicted_label"].value_counts())  # count how many 0s and 1s were predicted
print(admissions.head())

0    507
1    137
Name: predicted_label, dtype: int64
   admit       gpa         gre  predicted_label
0      0  3.177277  594.102992                0
1      0  3.412655  631.528607                0
2      0  2.728097  553.714399                0
3      0  3.093559  551.089985                0
4      0  3.141923  537.184894                0

Now that we have predictions, we can compare them with the true values; the first metric that comes to mind is accuracy.

# Check the accuracy
matches = admissions["predicted_label"] == admissions["admit"]
correct_predictions = admissions[matches]  # the correctly predicted rows
accuracy = len(correct_predictions) / float(len(admissions))
print(accuracy)

0.6847826086956522
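sklearn can compute the same number directly; it is also worth checking the true class balance, which hints at why accuracy alone is misleading here. A quick sketch:

from sklearn.metrics import accuracy_score

print(accuracy_score(admissions["admit"], admissions["predicted_label"]))  # same value as above
print(model.score(admissions[["gpa"]], admissions["admit"]))  # classifiers also expose accuracy via .score()
print(admissions["admit"].value_counts())  # class balance: a model that always predicts 0 would already look decent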

For a binary logistic regression, 68% is passable. But accuracy is often deceiving, so it should not be the only metric. As the 西瓜书 (the "watermelon book", Zhou Zhihua's Machine Learning) explains, there are four basic counts: TP, TN, FN, and FP.

TP: predicted 1, and the prediction is correct (true label is 1)
TN: predicted 0, and the prediction is correct (true label is 0)
FN: predicted 0, but the prediction is wrong (true label is 1)
FP: predicted 1, but the prediction is wrong (true label is 0)

True Positive Rate: TPR = TP / (TP + FN)
Meaning: of all actual positives, the fraction predicted correctly

True Negative Rate: TNR = TN / (FP + TN)
Meaning: of all actual negatives, the fraction predicted correctly

true_positive_filter = (admissions["predicted_label"] == 1) & (admissions["admit"] == 1)
true_positives = len(admissions[true_positive_filter])

false_negative_filter = (admissions["predicted_label"] == 0) & (admissions["admit"] == 1)
false_negatives = len(admissions[false_negative_filter])

true_negative_filter = (admissions["predicted_label"] == 0) & (admissions["admit"] == 0)
true_negatives = len(admissions[true_negative_filter])

false_positive_filter = (admissions["predicted_label"] == 1) & (admissions["admit"] == 0)
false_positives = len(admissions[false_positive_filter])
#--------------------------------------------------------------

sensitivity = true_positives / float((true_positives + false_negatives))
specificity = (true_negatives) / float((false_positives + true_negatives))

print(sensitivity)
print(specificity)

0.36475409836065575
Out of every 100 applicants who should be admitted, the model only catches about 36, so it is poor at detecting positives.

0.88
Out of every 100 applicants who should be rejected, only about 12 are wrongly admitted, so it is good at detecting negatives.
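The same four counts can be obtained directly from sklearn instead of manual boolean filters. A minimal sketch using sklearn.metrics.confusion_matrix:

from sklearn.metrics import confusion_matrix

# For labels [0, 1], confusion_matrix returns [[TN, FP], [FN, TP]].
tn, fp, fn, tp = confusion_matrix(admissions["admit"], admissions["predicted_label"]).ravel()
print(tp / float(tp + fn))  # sensitivity (TPR)
print(tn / float(fp + tn))  # specificity (TNR)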


Improvement

Up to now we trained on the entire data set, which is not good practice. The most common remedy is cross-validation.

import pandas as pd
import numpy as np

admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)  # drop the original admit column

shuffled_index = np.random.permutation(admissions.index)  # shuffle the rows
shuffled_admissions = admissions.loc[shuffled_index]
admissions = shuffled_admissions.reset_index()
# Split the data set into 5 folds (.loc slicing includes both endpoints)
admissions.loc[0:128, "fold"] = 1
admissions.loc[129:257, "fold"] = 2
admissions.loc[258:386, "fold"] = 3
admissions.loc[387:514, "fold"] = 4
admissions.loc[515:644, "fold"] = 5
# Ensure the column is set to integer type.
admissions["fold"] = admissions["fold"].astype('int')

print(admissions.head())
print(admissions.tail())

   index       gpa         gre  actual_label  fold
0     65  3.481769  619.293122             0     1
1    556  2.914923  795.182376             1     1
2    142  3.497400  696.870625             0     1
3    381  3.202438  610.796489             0     1
4     12  3.018922  567.714830             0     1
     index       gpa         gre  actual_label  fold
639    207  2.973904  561.087115             0     5
640    188  3.229678  563.682408             0     5
641    355  2.749544  639.758145             0     5
642     80  3.278169  633.001482             0     5
643    232  2.941825  598.214415             0     5
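A quick sanity check (sketch) that the five folds partition all 644 rows:

print(admissions["fold"].value_counts().sort_index())  # rows per fold; the counts should sum to 644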

# One iteration of cross-validation
from sklearn.linear_model import LogisticRegression
# Training
model = LogisticRegression()
train_iteration_one = admissions[admissions["fold"] != 1]  # the other folds form the training set
test_iteration_one = admissions[admissions["fold"] == 1].copy()  # fold 1 is the validation set (.copy() avoids SettingWithCopyWarning)
model.fit(train_iteration_one[["gpa"]], train_iteration_one["actual_label"])

# Predicting
labels = model.predict(test_iteration_one[["gpa"]])
test_iteration_one["predicted_label"] = labels

# Check the accuracy
matches = test_iteration_one["predicted_label"] == test_iteration_one["actual_label"]
correct_predictions = test_iteration_one[matches]
iteration_one_accuracy = len(correct_predictions) / float(len(test_iteration_one))
print(iteration_one_accuracy)

0.6434108527131783

# Full cross-validation: each fold takes one turn as the validation set
import numpy as np
fold_ids = [1, 2, 3, 4, 5]
def train_and_test(df, folds):
    fold_accuracies = []
    for fold in folds:
        model = LogisticRegression()
        train = df[df["fold"] != fold]  # use the passed-in frame, not the global
        test = df[df["fold"] == fold].copy()
        model.fit(train[["gpa"]], train["actual_label"])
        labels = model.predict(test[["gpa"]])
        test["predicted_label"] = labels

        matches = test["predicted_label"] == test["actual_label"]
        correct_predictions = test[matches]
        fold_accuracies.append(len(correct_predictions) / float(len(test)))
    return fold_accuracies

accuracies = train_and_test(admissions, fold_ids)
print(accuracies)
average_accuracy = np.mean(accuracies)  # average across folds
print(average_accuracy)

[0.6434108527131783, 0.689922480620155, 0.6589147286821705, 0.71875, 0.7131782945736435]
0.6848352713178294

The above implements cross-validation by hand; sklearn already provides it, so in practice you can call it directly.

from sklearn.model_selection import KFold  # cross-validation utilities
from sklearn.model_selection import cross_val_score

admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

kf = KFold(n_splits=5, shuffle=True, random_state=8)  # newer sklearn: no sample-count argument
lr = LogisticRegression()
# scoring can be "accuracy" or "roc_auc" (area under the ROC curve)
accuracies = cross_val_score(lr, admissions[["gpa"]], admissions["actual_label"], scoring="accuracy", cv=kf)
average_accuracy = sum(accuracies) / len(accuracies)

print(accuracies)
print(average_accuracy)

[0.62015504 0.71317829 0.63565891 0.75193798 0.7109375 ]
0.686373546511628
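The same call with scoring="roc_auc" reports the area under the ROC curve instead of accuracy; a sketch:

auc_scores = cross_val_score(lr, admissions[["gpa"]], admissions["actual_label"], scoring="roc_auc", cv=kf)
print(auc_scores)
print(auc_scores.mean())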


Errors encountered along the way

  1. 'DataFrame' object has no attribute 'ix'
    Cause: newer pandas versions removed the Series.ix and DataFrame.ix methods.
    Fix: change ix to loc (iloc will not work here, since it takes positional indices rather than the column label "fold"; see the sketch after this list for a related subtlety). For example,
admissions.ix[0:128, "fold"] = 1

becomes

admissions.loc[0:128, "fold"] = 1
  2. No module named 'sklearn.cross_validation'
    Cause: sklearn removed cross_validation and merged its contents into model_selection.
    Fix: change cross_validation to model_selection. For example,
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

becomes

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
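On the first fix: unlike ordinary Python slicing, .loc slicing includes both endpoints, which is why admissions.loc[0:128, "fold"] assigns 129 rows. A minimal sketch on a toy frame:

import pandas as pd

df = pd.DataFrame({"x": range(5)})
df.loc[0:2, "fold"] = 1  # assigns rows with labels 0, 1, AND 2 (three rows, not two)
print(df)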