记录利用逻辑回归进行学生入学是否接收的问题
模型分析
# Load the admissions data and eyeball the gpa-vs-admit relationship.
import pandas as pd
import matplotlib.pyplot as plt

admissions = pd.read_csv("admissions.csv")
print(admissions.head())
plt.scatter(x=admissions["gpa"], y=admissions["admit"])
plt.show()
原数据有两个指标:gpa、gre,一个label:admit。
#from sklearn.linear_model import LinearRegression
#linear_model = LinearRegression()
#linear_model.fit(admissions[["gpa"]], admissions["admit"])
#与逻辑回归一样的操作
from sklearn.linear_model import LogisticRegression  # import the model class
logistic_model = LogisticRegression()  # instantiate the model
logistic_model.fit(admissions[["gpa"]], admissions["admit"])  # fit on the data set
# Probability of admission as a function of gpa.
# Bug fix: keep the return value in `pred_probs` -- the original discarded it,
# leaving `pred_probs` undefined (NameError) in the scatter call below.
pred_probs = logistic_model.predict_proba(admissions[["gpa"]])
plt.scatter(admissions["gpa"], pred_probs[:, 1])  # column 1: P(admit == 1)
plt.show()
# Hard 0/1 admit classification from gpa.
fitted_labels = logistic_model.predict(admissions[["gpa"]])
plt.scatter(admissions["gpa"], fitted_labels)
plt.show()
完成分析,最后这里添加一个重要的sigmoid函数实现,方便复习:
import numpy as np
# Logistic (sigmoid) function. NOTE(review): the notes call it "logit", but the
# logit is actually the inverse of this curve; the name is kept for compatibility.
def logit(x):
    """Map x (scalar or ndarray) into (0, 1) along the logistic S-curve.

    Uses 1 / (1 + e^-x), which is algebraically identical to the original
    e^x / (1 + e^x) but avoids inf/inf = nan when e^x overflows for
    large positive x (roughly x > 709 in float64).
    """
    return 1.0 / (1.0 + np.exp(-x))
# Sample 50 evenly spaced points on [-6, 6] and draw the resulting S-curve.
xs = np.linspace(-6.0, 6.0, num=50)
ys = logit(xs)
plt.plot(xs, ys)
plt.ylabel("Probability")
plt.show()
模型评估
光对数据集分析还不行,需要看看模型用的怎么样
# Re-fit the model from scratch so the evaluation section is self-contained.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

admissions = pd.read_csv("admissions.csv")
model = LogisticRegression()
model.fit(admissions[["gpa"]], admissions["admit"])
# Attach the fitted labels as a new column so prediction and truth
# can be compared row by row.
admissions["predicted_label"] = model.predict(admissions[["gpa"]])
print(admissions["predicted_label"].value_counts())  # how many 0s / 1s predicted
print(admissions.head())
0 507
1 137
Name: predicted_label, dtype: int64
admit gpa gre predicted_label
0 0 3.177277 594.102992 0
1 0 3.412655 631.528607 0
2 0 2.728097 553.714399 0
3 0 3.093559 551.089985 0
4 0 3.141923 537.184894 0
现在有了预测值,就可以和真实值进行比较,最常想到的就是精度
# How good is the accuracy?
# Accuracy = correctly predicted rows / all rows.
is_correct = admissions["predicted_label"] == admissions["admit"]
accuracy = len(admissions[is_correct]) / float(len(admissions))
print(accuracy)
0.6847826086956522
对于二分类的逻辑回归来说,68%已经可以啦,但是很多时候精度是骗人的,不能只拿精度看,在西瓜书中讲到,有TP、TN、FN、FP四个指标。
TP:预测的是1,并且预测对了
TN:预测的是0,并且预测对了
FN:预测的是0,但预测错了
FP:预测的是1,但预测错了
True Positive Rate:TPR = TP/(TP + FN)
含义:看正例里面有多少预测对了
True Negative Rate:TNR = TN/(FP + TN)
含义:看负例里面有多少预测对了
# Confusion-matrix counts.
# Bug fix: at this point the ground-truth column is still named "admit" --
# "actual_label" is only created later (after a fresh read_csv), so filtering
# on "actual_label" here would raise a KeyError.
true_positive_filter = (admissions["predicted_label"] == 1) & (admissions["admit"] == 1)
true_positives = len(admissions[true_positive_filter])
false_negative_filter = (admissions["predicted_label"] == 0) & (admissions["admit"] == 1)
false_negatives = len(admissions[false_negative_filter])
true_negative_filter = (admissions["predicted_label"] == 0) & (admissions["admit"] == 0)
true_negatives = len(admissions[true_negative_filter])
false_positive_filter = (admissions["predicted_label"] == 1) & (admissions["admit"] == 0)
false_positives = len(admissions[false_positive_filter])
#--------------------------------------------------------------
# Sensitivity (TPR) = TP / (TP + FN): share of real positives recovered.
sensitivity = true_positives / float((true_positives + false_negatives))
# Specificity (TNR) = TN / (FP + TN): share of real negatives recovered.
specificity = (true_negatives) / float((false_positives + true_negatives))
print(sensitivity)
print(specificity)
0.36475409836065575
100人应该被录取但是只录取了36个,说明模型检测正例效果不好
0.88
100人本来都不能录取但是录取了12个,说明模型检测负例效果好
改进
发现之前我们是直接用所有的数据进行训练,这样并不好。最常使用的是方法是交叉验证
。
import pandas as pd
import numpy as np

admissions = pd.read_csv("admissions.csv")
# Rename the target column: "admit" -> "actual_label".
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)
# Shuffle the rows so each fold is a random sample of the data.
shuffled_admissions = admissions.loc[np.random.permutation(admissions.index)]
admissions = shuffled_admissions.reset_index()
# Partition the shuffled rows into 5 roughly equal folds
# (.loc slicing is inclusive of the end label).
for start, stop, fold_id in [(0, 128, 1), (129, 257, 2), (258, 386, 3),
                             (387, 514, 4), (515, 644, 5)]:
    admissions.loc[start:stop, "fold"] = fold_id
# Ensure the fold column is integer-typed.
admissions["fold"] = admissions["fold"].astype('int')
print(admissions.head())
print(admissions.tail())
index gpa gre actual_label fold
0 65 3.481769 619.293122 0 1
1 556 2.914923 795.182376 1 1
2 142 3.497400 696.870625 0 1
3 381 3.202438 610.796489 0 1
4 12 3.018922 567.714830 0 1
index gpa gre actual_label fold
639 207 2.973904 561.087115 0 5
640 188 3.229678 563.682408 0 5
641 355 2.749544 639.758145 0 5
642 80 3.278169 633.001482 0 5
643 232 2.941825 598.214415 0 5
# One iteration of 5-fold cross-validation: hold out fold 1, train on the rest.
from sklearn.linear_model import LogisticRegression
# Training
model = LogisticRegression()
train_iteration_one = admissions[admissions["fold"] != 1]  # folds 2-5 as training set
# Bug fix: .copy() so the predicted_label assignment below writes to an
# independent frame rather than a view of `admissions`
# (SettingWithCopyWarning; silent no-op under pandas copy-on-write).
test_iteration_one = admissions[admissions["fold"] == 1].copy()  # fold 1 as validation set
model.fit(train_iteration_one[["gpa"]], train_iteration_one["actual_label"])
# Predicting on the held-out fold.
labels = model.predict(test_iteration_one[["gpa"]])
test_iteration_one["predicted_label"] = labels
# Accuracy on the held-out fold.
matches = test_iteration_one["predicted_label"] == test_iteration_one["actual_label"]
correct_predictions = test_iteration_one[matches]
iteration_one_accuracy = len(correct_predictions) / float(len(test_iteration_one))
print(iteration_one_accuracy)
# Full cross-validation loop: each fold takes one turn as the validation set.
import numpy as np
fold_ids = [1, 2, 3, 4, 5]
def train_and_test(df, folds):
    """Train/validate once per fold and return the per-fold accuracies.

    df: DataFrame with "gpa", "actual_label" and "fold" columns.
    folds: iterable of fold ids; each id is held out once as the validation set.
    """
    fold_accuracies = []
    for fold in folds:
        model = LogisticRegression()
        # Bug fix: operate on the `df` parameter -- the original closed over the
        # global `admissions`, silently ignoring its argument.
        train = df[df["fold"] != fold]
        # .copy() so the column assignment below does not target a view of df.
        test = df[df["fold"] == fold].copy()
        model.fit(train[["gpa"]], train["actual_label"])
        test["predicted_label"] = model.predict(test[["gpa"]])
        matches = test["predicted_label"] == test["actual_label"]
        correct_predictions = test[matches]
        fold_accuracies.append(len(correct_predictions) / float(len(test)))
    return fold_accuracies
accuracies = train_and_test(admissions, fold_ids)
print(accuracies)
average_accuracy = np.mean(accuracies)  # mean accuracy over the 5 folds
print(average_accuracy)
[0.6434108527131783, 0.689922480620155, 0.6589147286821705, 0.71875, 0.7131782945736435]
0.6848352713178294
上面是自己实现交叉验证的过程,其实python已经提供了交叉验证的库,平常可以直接调用
from sklearn.model_selection import KFold  # cross-validation splitter
from sklearn.model_selection import cross_val_score
admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)
# Bug fix: model_selection.KFold does not take the sample count as its first
# argument (that was the removed cross_validation API) -- passing
# len(admissions) positionally collides with n_splits and raises a TypeError.
kf = KFold(n_splits=5, shuffle=True, random_state=8)
lr = LogisticRegression()
# scoring accepts "accuracy" or "roc_auc" (area under the ROC curve).
# Bug fix: actually pass cv=kf so the configured splitter is used
# (the original built kf but never used it).
accuracies = cross_val_score(lr, admissions[["gpa"]], admissions["actual_label"], scoring="accuracy", cv=kf)
average_accuracy = sum(accuracies) / len(accuracies)
print(accuracies)
print(average_accuracy)
[0.62015504 0.71317829 0.63565891 0.75193798 0.7109375 ]
0.686373546511628
记录一下过程中的error
- ‘DataFrame’ object has no attribute ‘ix’
原因:新版本移除了Series.ix and DataFrame.ix 方法。
方法:ix改成loc(注意不是iloc:iloc只接受整数位置,传入列名"fold"会报错,而且iloc的切片不含终点;loc按标签切片、含终点,与原ix行为一致),比如
将
admissions.ix[0:128, "fold"] = 1
换为
admissions.loc[0:128, "fold"] = 1
- No module named ‘sklearn.cross_validation’
原因:sklearn中已经废弃cross_validation,将其中的内容整合到model_selection
方法:cross_validation改成model_selection,比如
将
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
换为
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score