数据分析–模型建立和评估
数据分析
一、模型搭建
1.1导入数据集
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.rcParams['figure.figsize'] = (10, 6) # 设置输出图片大小
# 读取训练数集
train = pd.read_csv('train.csv')
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
1.2读取清洗过的数据集
data = pd.read_csv('clear_data.csv')
data.head()
PassengerId Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C Embarked_Q Embarked_S
0 0 3 22.0 1 0 7.2500 0 1 0 0 1
1 1 1 38.0 1 0 71.2833 1 0 1 0 0
2 2 3 26.0 0 0 7.9250 1 0 0 0 1
3 3 1 35.0 1 0 53.1000 1 0 0 0 1
4 4 3 35.0 0 0 8.0500 0 1 0 0 1
1.3切割训练集和测试集
from sklearn.model_selection import train_test_split
X = data
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
X_train.shape, X_test.shape
((668, 11), (223, 11))
1.4模型创建
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
Training set score: 0.80
Testing set score: 0.79
调整参数后的逻辑回归模型
lr2 = LogisticRegression(C=100)
lr2.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr2.score(X_train, y_train)))
print("Testing set score: {:.2f}".format(lr2.score(X_test, y_test)))
Training set score: 0.79
Testing set score: 0.78
1.5输出模型预测结果
pred = lr.predict(X_train)
pred_proba = lr.predict_proba(X_train)
pred_proba[:10]
array([[0.60870022, 0.39129978],
[0.17725433, 0.82274567],
[0.40750365, 0.59249635],
[0.18925851, 0.81074149],
[0.87973912, 0.12026088],
[0.91374559, 0.08625441],
[0.13293198, 0.86706802],
[0.90560801, 0.09439199],
[0.05283987, 0.94716013],
[0.10936016, 0.89063984]])
二、模型评估
2.1引入库
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
plt.rcParams['figure.figsize'] = (10, 6) # 设置输出图片大小
from sklearn.model_selection import train_test_split
data = pd.read_csv('clear_data.csv')
train = pd.read_csv('train.csv')
X = data
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
2.2交叉验证
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(C=100)
scores = cross_val_score(lr, X_train, y_train, cv=10)
print("Average cross-validation score: {:.2f}".format(scores.mean()))
Average cross-validation score: 0.80
2.3混淆矩阵
from sklearn.metrics import confusion_matrix
lr = LogisticRegression(C=100)
lr.fit(X_train, y_train)
pred = lr.predict(X_train)
confusion_matrix(y_train, pred)
from sklearn.metrics import classification_report
print(classification_report(y_train, pred))
precision recall f1-score support
0 0.81 0.86 0.83 412
1 0.75 0.68 0.71 256
accuracy 0.79 668
macro avg 0.78 0.77 0.77 668
weighted avg 0.79 0.79 0.79 668