Titanic----6

# Data analysis and wrangling
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
combine = [train_df, test_df]
print(train_df.columns)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
train_df.head()
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
train_df.info()
print('_'*40)
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
train_df.describe()
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
train_df.describe(include='O')
                          Name   Sex  Ticket Cabin Embarked
count                      891   891     891   204      889
unique                     891     2     681   147        3
top     Ford, Mr. William Neal  male  347082    G6        S
freq                         1   577       7     4      644
train_df[['Pclass', 'Survived']].groupby(['Pclass'],as_index=False)\
.mean().sort_values(by='Survived',ascending=False)
   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
train_df.groupby(['Sex'])[['Survived']].mean()
        Survived
Sex
female  0.742038
male    0.188908
train_df[['Sex', 'Survived']].groupby(['Sex'],as_index=False)\
.mean().sort_values(by='Survived',ascending=False)
      Sex  Survived
0  female  0.742038
1    male  0.188908
train_df[['SibSp', 'Survived']].groupby(['SibSp'],as_index=False)\
.mean().sort_values(by='Survived',ascending=False)
   SibSp  Survived
1      1  0.535885
2      2  0.464286
0      0  0.345395
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000
train_df[['Parch', 'Survived']].groupby(['Parch'],as_index=False)\
.mean().sort_values(by='Survived',ascending=False)
   Parch  Survived
3      3  0.600000
1      1  0.550847
2      2  0.500000
0      0  0.343658
5      5  0.200000
4      4  0.000000
6      6  0.000000
g = sns.FacetGrid(train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)  # bins: number of histogram bars
<seaborn.axisgrid.FacetGrid at 0x1147ea9b0>

[Figure: Age histograms, one panel per Survived value]

grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)  # bins: number of bars; alpha: transparency
grid.add_legend()  # add a legend
<seaborn.axisgrid.FacetGrid at 0x1149180b8>

[Figure: Age histograms by Pclass (rows) and Survived (columns)]

grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:708: UserWarning: Using the pointplot function without specifying `hue_order` is likely to produce an incorrect plot.
  warnings.warn(warning)

<seaborn.axisgrid.FacetGrid at 0x11c9b0dd8>

[Figure: survival rate vs. Pclass by Sex, one row per Embarked port]

grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None)
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)

<seaborn.axisgrid.FacetGrid at 0x11ce0b8d0>

[Figure: mean Fare by Sex, by Embarked (rows) and Survived (columns)]

print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
Before (891, 12) (418, 11) (891, 12) (418, 11)
# Drop irrelevant features
train_df = train_df.drop(['Ticket', 'Cabin', 'Name'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin', 'Name'], axis=1)
combine = [train_df, test_df]
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
After (891, 9) (418, 8) (891, 9) (418, 8)
# Convert categorical features to numeric
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female':1, 'male':0}).astype(int)
train_df.head()
   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    0  22.0      1      0   7.2500        S
1            2         1       1    1  38.0      1      0  71.2833        C
2            3         1       3    1  26.0      0      0   7.9250        S
3            4         1       1    1  35.0      1      0  53.1000        S
4            5         0       3    0  35.0      0      0   8.0500        S
# Impute missing numeric values
guess_ages = np.zeros((2,3))
guess_ages
array([[0., 0., 0.],
       [0., 0., 0.]])
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                   (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            # Round the guessed age to the nearest 0.5
            guess_ages[i, j] = int(age_guess/0.5 + 0.5) * 0.5

    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex ==i) & ( dataset.Pclass == j+1),\
                                                                    ['Age']] = guess_ages[i, j]
train_df.head()       
   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    0  22.0      1      0   7.2500        S
1            2         1       1    1  38.0      1      0  71.2833        C
2            3         1       3    1  26.0      0      0   7.9250        S
3            4         1       1    1  35.0      1      0  53.1000        S
4            5         0       3    0  35.0      0      0   8.0500        S
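The nested loops above estimate a median age for each (Sex, Pclass) cell and round it to the nearest 0.5 before filling. The same imputation, minus the rounding, can be written more compactly with groupby/transform; this is a sketch of an alternative, not what was run above:

for dataset in combine:
    # Fill each missing Age with the median Age of its (Sex, Pclass) group
    dataset['Age'] = dataset['Age'].fillna(
        dataset.groupby(['Sex', 'Pclass'])['Age'].transform('median'))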
# Convert continuous values into categorical bands
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)   # cut() makes equal-width bins; qcut() makes equal-count bins
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False). \
                                mean().sort_values(by='AgeBand', ascending=True)
            AgeBand  Survived
0    (0.34, 16.336]  0.550000
1  (16.336, 32.252]  0.336714
2  (32.252, 48.168]  0.412844
3  (48.168, 64.084]  0.434783
4    (64.084, 80.0]  0.090909
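As the comment above notes, cut() splits by equal-width value ranges while qcut() splits by equal-count quantiles. A tiny illustration on made-up data (a sketch, not part of the notebook):

s = pd.Series([1, 2, 3, 4, 100])
print(pd.cut(s, 2))    # equal-width: (0.901, 50.5] and (50.5, 100.0]; the outlier sits alone
print(pd.qcut(s, 2))   # equal-count: split at the median 3.0, so the bins hold 3 and 2 values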
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
train_df.head()
   PassengerId  Survived  Pclass  Sex  Age  SibSp  Parch     Fare Embarked           AgeBand
0            1         0       3    0  1.0      1      0   7.2500        S  (16.336, 32.252]
1            2         1       1    1  2.0      1      0  71.2833        C  (32.252, 48.168]
2            3         1       3    1  1.0      0      0   7.9250        S  (16.336, 32.252]
3            4         1       1    1  2.0      1      0  53.1000        S  (32.252, 48.168]
4            5         0       3    0  2.0      0      0   8.0500        S  (32.252, 48.168]
train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()
   PassengerId  Survived  Pclass  Sex  Age  SibSp  Parch     Fare Embarked
0            1         0       3    0  1.0      1      0   7.2500        S
1            2         1       1    1  2.0      1      0  71.2833        C
2            3         1       3    1  1.0      0      0   7.9250        S
3            4         1       1    1  2.0      1      0  53.1000        S
4            5         0       3    0  2.0      0      0   8.0500        S
# Impute missing categorical values (only two are missing, so fill with the most common port)
freq_port = train_df.Embarked.dropna().mode()[0]   # most frequent value
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().\
                                                sort_values(by='Survived', ascending=False)
  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
train_df.head()
   PassengerId  Survived  Pclass  Sex  Age  SibSp  Parch     Fare  Embarked
0            1         0       3    0  1.0      1      0   7.2500         0
1            2         1       1    1  2.0      1      0  71.2833         1
2            3         1       3    1  1.0      0      0   7.9250         0
3            4         1       1    1  2.0      1      0  53.1000         0
4            5         0       3    0  2.0      0      0   8.0500         0
# Few values are missing, so fill with the median
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)   
# Discretize the fare
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().\
                                            sort_values(by='FareBand', ascending=True)
          FareBand  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]
    
train_df.head(10)
   PassengerId  Survived  Pclass  Sex  Age  SibSp  Parch  Fare  Embarked
0            1         0       3    0  1.0      1      0     0         0
1            2         1       1    1  2.0      1      0     3         1
2            3         1       3    1  1.0      0      0     1         0
3            4         1       1    1  2.0      1      0     3         0
4            5         0       3    0  2.0      0      0     1         0
5            6         0       3    0  1.0      0      0     1         2
6            7         0       1    0  3.0      0      0     3         0
7            8         0       3    0  0.0      3      1     2         0
8            9         1       3    1  1.0      0      2     1         0
9           10         1       2    1  0.0      1      0     2         1
# Try creating new features
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().\
                                                sort_values(by='Survived', ascending=False)

   FamilySize  Survived
3           4  0.724138
2           3  0.578431
1           2  0.552795
6           7  0.333333
0           1  0.303538
4           5  0.200000
5           6  0.136364
7           8  0.000000
8          11  0.000000
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
   IsAlone  Survived
0        0  0.505650
1        1  0.303538
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]

train_df.head()
   PassengerId  Survived  Pclass  Sex  Age  Fare  Embarked  IsAlone
0            1         0       3    0  1.0     0         0        0
1            2         1       1    1  2.0     3         1        0
2            3         1       3    1  1.0     1         0        1
3            4         1       1    1  2.0     3         0        0
4            5         0       3    0  2.0     1         0        1
# Prepare the data
X_train = train_df.drop(["Survived","PassengerId"], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop(["PassengerId"], axis=1)
X_train.shape, Y_train.shape, X_test.shape
((891, 6), (891,), (418, 6))
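Every score reported below is computed on the same rows the models were fit on, so the numbers are optimistic. A held-out validation split (a sketch; the runs below do not use it) would give a more honest estimate:

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for validation (illustrative only)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)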
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# Logistic Regression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


logistic = LogisticRegression()
logistic.fit(X_train, Y_train)
y_pred = logistic.predict(X_train)
acc_log = round(logistic.score(X_train, Y_train) * 100, 2)
print(acc_log)

print('Using all the data as the training set')
print('Logistic regression accuracy: {}'.format(logistic.score(X_train, Y_train)))
y_pred = logistic.predict(X_train)

print('Logistic regression precision: {}'.format(precision_score(Y_train, y_pred)))
print('Logistic regression recall: {}'.format(recall_score(Y_train, y_pred)))
print('Logistic regression F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, logistic.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()


78.56
Using all the data as the training set
Logistic regression accuracy: 0.7856341189674523
Logistic regression precision: 0.7267267267267268
Logistic regression recall: 0.7076023391812866
Logistic regression F1-score: 0.7170370370370371

[Figure: ROC curve for logistic regression]

svc = SVC(probability=True)
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_train)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

print('Using all the data as the training set')
print('SVM accuracy: {}'.format(svc.score(X_train, Y_train)))
y_pred = svc.predict(X_train)

print('SVM precision: {}'.format(precision_score(Y_train, y_pred)))
print('SVM recall: {}'.format(recall_score(Y_train, y_pred)))
print('SVM F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, svc.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

Using all the data as the training set
SVM accuracy: 0.8215488215488216
SVM precision: 0.8279569892473119
SVM recall: 0.6754385964912281
SVM F1-score: 0.7439613526570048

[Figure: ROC curve for SVM]

# KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
83.61
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
75.31
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
/Users/shenxin/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.perceptron.Perceptron'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)





77.1
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc
78.56
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
/Users/shenxin/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)





65.88
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

print('Using all the data as the training set')
print('Decision tree accuracy: {}'.format(decision_tree.score(X_train, Y_train)))
y_pred = decision_tree.predict(X_train)

print('Decision tree precision: {}'.format(precision_score(Y_train, y_pred)))
print('Decision tree recall: {}'.format(recall_score(Y_train, y_pred)))
print('Decision tree F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, decision_tree.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

Using all the data as the training set
Decision tree accuracy: 0.8574635241301908
Decision tree precision: 0.8825622775800712
Decision tree recall: 0.7251461988304093
Decision tree F1-score: 0.7961476725521669

[Figure: ROC curve for the decision tree]

# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest
85.75
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Descent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)
                         Model  Score
3                Random Forest  85.75
8                Decision Tree  85.75
1                          KNN  83.61
0      Support Vector Machines  82.15
2          Logistic Regression  78.56
7                   Linear SVC  78.56
5                   Perceptron  77.10
4                  Naive Bayes  75.31
6  Stochastic Gradient Descent  65.88
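Random Forest and Decision Tree top this table largely because they can memorize the training set, so training accuracy is a weak ranking. A quick cross-validated comparison (a sketch reusing the classifiers fitted above) is more trustworthy:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for some of the models above (illustrative only)
for name, model in [('Logistic Regression', logistic), ('SVC', svc), ('KNN', knn),
                    ('Random Forest', random_forest), ('Decision Tree', decision_tree)]:
    scores = cross_val_score(model, X_train, Y_train, cv=5)
    print('{}: {:.4f} (+/- {:.4f})'.format(name, scores.mean(), scores.std()))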
from sklearn.model_selection import cross_val_score
# Random Forest

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest.score(X_train, Y_train)
print('Random forest cross-validation scores:', cross_val_score(random_forest, X_train, Y_train))

print('Using all the data as the training set')
print('Random forest accuracy: {}'.format(random_forest.score(X_train, Y_train)))
y_pred = random_forest.predict(X_train)

print('Random forest precision: {}'.format(precision_score(Y_train, y_pred)))
print('Random forest recall: {}'.format(recall_score(Y_train, y_pred)))
print('Random forest F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, random_forest.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
Random forest cross-validation scores: [0.77104377 0.79461279 0.80808081]
Using all the data as the training set
Random forest accuracy: 0.856341189674523
Random forest precision: 0.8519736842105263
Random forest recall: 0.7573099415204678
Random forest F1-score: 0.8018575851393188

[Figure: ROC curve for the random forest]

# XGBoost

from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc.fit(X_train, Y_train)
xgbc.score(X_train, Y_train)
print('XGBoost cross-validation scores:', cross_val_score(xgbc, X_train, Y_train))

print('Using all the data as the training set')
print('XGBoost accuracy: {}'.format(xgbc.score(X_train, Y_train)))
y_pred = xgbc.predict(X_train)

print('XGBoost precision: {}'.format(precision_score(Y_train, y_pred)))
print('XGBoost recall: {}'.format(recall_score(Y_train, y_pred)))
print('XGBoost F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, xgbc.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
XGBoost cross-validation scores: [0.78114478 0.80808081 0.82154882]
Using all the data as the training set
XGBoost accuracy: 0.8395061728395061
XGBoost precision: 0.8419243986254296
XGBoost recall: 0.716374269005848
XGBoost F1-score: 0.7740916271721958


/Users/shenxin/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

[Figure: ROC curve for XGBoost]

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

logistic = LogisticRegression()
logistic.fit(X_train, Y_train)
print('If all the data is used as the training set, predicting on it gives the following results:')
print('Logistic regression accuracy: {}'.format(logistic.score(X_train, Y_train)))
y_pred = logistic.predict(X_train)

print('Logistic regression precision: {}'.format(precision_score(Y_train, y_pred)))
print('Logistic regression recall: {}'.format(recall_score(Y_train, y_pred)))
print('Logistic regression F1-score: {}'.format(f1_score(Y_train, y_pred)))

fpr, tpr, _ = roc_curve(Y_train, logistic.predict_proba(X_train)[:,1])
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
If all the data is used as the training set, predicting on it gives the following results:
Logistic regression accuracy: 0.7856341189674523
Logistic regression precision: 0.7267267267267268
Logistic regression recall: 0.7076023391812866
Logistic regression F1-score: 0.7170370370370371

[Figure: ROC curve for logistic regression]

# Model tuning

from sklearn.model_selection import GridSearchCV
# Logistic Regression

logistic = LogisticRegression()

param_grid = {'C': [0.45, 0.5, 1]}   # C: inverse of the regularization strength
grid_logistic = GridSearchCV(estimator = logistic, param_grid=param_grid, cv = 5, n_jobs=-1)
grid_logistic.fit(X_train, Y_train)
print(grid_logistic.best_params_)
print(grid_logistic.best_estimator_)
print('Logistic regression accuracy after tuning: {}'.format(grid_logistic.score(X_train, Y_train)))

{'C': 1}
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Logistic regression accuracy after tuning: 0.7856341189674523
# SVM

svm = SVC()

param_grid = {'C': [0.45, 0.5, 1]}
grid_svm = GridSearchCV(estimator = svm, param_grid=param_grid, cv = 5, n_jobs=-1)
grid_svm.fit(X_train, Y_train)
print(grid_svm.best_params_)
print(grid_svm.best_estimator_)
print('SVM accuracy after tuning: {}'.format(grid_svm.score(X_train, Y_train)))

{'C': 1}
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
SVM accuracy after tuning: 0.8215488215488216
# Decision Tree

decision_tree = DecisionTreeClassifier()

param_grid = {'splitter': ['best', 'random'], 'max_features': ['auto', 'sqrt', 'log2', None],
             'max_depth': [5, 10, 20]}
grid_decision_tree = GridSearchCV(estimator = decision_tree, param_grid=param_grid, cv = 5, n_jobs=-1)
grid_decision_tree.fit(X_train, Y_train)

print(grid_decision_tree.best_params_)
print(grid_decision_tree.best_estimator_)
print('Decision tree accuracy after tuning: {}'.format(grid_decision_tree.score(X_train, Y_train)))

{'max_depth': 5, 'max_features': None, 'splitter': 'best'}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision tree accuracy after tuning: 0.8294051627384961
# Random Forest

random_forest = RandomForestClassifier()

param_grid = {'n_estimators': [10, 20, 30, 40, 50], 'max_features': ['auto', 'sqrt', 'log2', None],
             'max_depth': [5, 10, 20]}
grid_random_forest = GridSearchCV(estimator = random_forest, param_grid=param_grid, cv = 5, n_jobs=-1)
grid_random_forest.fit(X_train, Y_train)

print(grid_random_forest.best_params_)
print(grid_random_forest.best_estimator_)
print('Random forest accuracy after tuning: {}'.format(grid_random_forest.score(X_train, Y_train)))

{'max_depth': 5, 'max_features': 'log2', 'n_estimators': 10}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Random forest accuracy after tuning: 0.8316498316498316
# XGBoost

from xgboost import XGBClassifier
xgbc = XGBClassifier()

param_grid = {'n_estimators':[1, 5, 10, 20, 40]}
grid_xgboost = GridSearchCV(estimator = xgbc, param_grid=param_grid, cv = 5 )


a, b, c, d = train_test_split(X_train, Y_train, test_size=.25)
grid_xgboost.fit(a, c, early_stopping_rounds=5, eval_set=[(b, d)])  # early stopping on a held-out eval set

print('XGBoost accuracy after tuning: {}'.format(grid_xgboost.score(X_train, Y_train)))


XGBoost accuracy after tuning: 0.8114478114478114
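The notebook stops at training-set scores and never writes a submission. A minimal sketch for producing one from the tuned model, assuming the standard Kaggle PassengerId/Survived format:

# Predict on the test set and write a Kaggle submission file
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': grid_xgboost.predict(X_test)
})
submission.to_csv('./submission.csv', index=False)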