逻辑回归实战 — Kaggle_Titanic 2

数据来源:https://www.kaggle.com/c/titanic

Training

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Load the Kaggle training set and draw a bar chart of how many passengers
# fall into each value of the 'Survived' column.
train_data = pd.read_csv('train.csv')
count_survivors = train_data['Survived'].value_counts()
count_survivors.plot.bar()
plt.xlabel('Is_survived')
plt.ylabel('Number of People')
plt.title('Survivor histogram')

survivor_hist

from sklearn.preprocessing import StandardScaler

# Encode sex as a binary feature: female -> 0, male -> 1.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages with the mean of the *observed* ages. Series.mean()
# skips NaN; substituting 0 for NaN before averaging (as was done before)
# drags the mean down in proportion to the number of missing values.
age_avg = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_avg)

# Standardize each numeric feature independently. The explicit float cast
# avoids StandardScaler's DataConversionWarning on the integer columns, and
# ravel() turns the (n, 1) result back into a 1-D column.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    train_data[column] = StandardScaler().fit_transform(
        train_data[column].values.astype('float64').reshape(-1, 1)).ravel()

# Encode the port of embarkation as 1/2/3 and fill missing ports with the
# most frequent one.
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
train_data['Embarked'] = train_data['Embarked'].fillna(
    train_data['Embarked'].mode()[0])

# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
# Split the frame into features (every column except 'Survived') and the
# target. `.ix` is deprecated, so the boolean column masks go through `.loc`.
# Y stays a single-column DataFrame, as before.
X = train_data.loc[:, train_data.columns != 'Survived']
Y = train_data.loc[:, train_data.columns == 'Survived']
c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.metrics import recall_score,confusion_matrix

def getBestC(X, Y):
    """Pick the regularization strength C with the best mean 5-fold recall.

    For every candidate C, an L1-penalized LogisticRegression is fitted on
    each training fold and scored with recall on the matching validation
    fold. Per-fold scores and the mean score are printed as they are
    computed.

    Args:
        X: Feature DataFrame; rows are selected positionally via ``.iloc``.
        Y: Target values as a Series or single-column DataFrame, aligned
            row-for-row with ``X``.

    Returns:
        The candidate C with the highest mean recall. Candidates are tried
        in ascending order and ``idxmax`` returns the first maximum, so the
        smallest C wins a tie.
    """
    # Legacy sklearn.cross_validation API: KFold(n_samples, n_folds) is
    # itself a re-iterable sequence of (train_indices, test_indices) pairs.
    folds = KFold(len(Y), 5)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    mean_recalls = []
    for c_param in c_param_range:
        print('******** c_param = %.2f ********' % c_param)
        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds, start=1):
            lr = LogisticRegression(C=c_param, penalty='l1')
            # ravel() hands scikit-learn the 1-D target it expects, which
            # avoids the column-vector DataConversionWarning.
            lr.fit(X.iloc[train_idx].values, Y.iloc[train_idx].values.ravel())
            Y_hat = lr.predict(X.iloc[test_idx].values)
            recall_acc = recall_score(Y.iloc[test_idx].values.ravel(), Y_hat)
            recall_accs.append(recall_acc)

            print('Iteration %d: recall score = %f' % (iteration, recall_acc))

        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('\nMean recall score %f\n' % mean_recall)

    # Build the summary table once all scores are known, instead of
    # pre-allocating it with an (accidentally empty) index and filling it
    # through the deprecated `.ix` indexer.
    results_table = pd.DataFrame(
        {'C_parameter': c_param_range, 'Mean recall score': mean_recalls},
        columns=['C_parameter', 'Mean recall score'])

    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c

c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
best_c = getBestC(X, Y)
******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000

Mean recall score 0.000000

******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413

Mean recall score 0.668273

******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032

Mean recall score 0.701604

******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

--------------------------------
best_c = 10.00


c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix `cm` on the current matplotlib axes.

    The matrix is rendered as an image with a colorbar, `classes` are used
    as the tick labels on both axes, and every cell is annotated with its
    value. Rows are labelled 'True label' and columns 'Predicted label'.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            if cm[row, col] > half_max:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools

# Refit on the whole training set with the cross-validated C. ravel() turns
# the single-column target into the 1-D array scikit-learn expects, which
# avoids the column-vector DataConversionWarning.
y_true = Y.values.ravel()
lr = LogisticRegression(C=best_c, penalty='l1')
lr.fit(X.values, y_true)
Y_hat = lr.predict(X.values)

# Compute confusion matrix (rows: true class, columns: predicted class).
cnf_matrix = confusion_matrix(y_true, Y_hat)

# Recall = TP / (FN + TP), read from the row of the positive class.
print("Recall value in training dataset: %f"
      % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall value in training dataset: 0.710526

confusion_matrix

# Fit on the whole training set and keep the class probabilities so the
# decision threshold can be varied by hand.
y_true = Y.values.ravel()  # 1-D target; avoids the column-vector warning
lr = LogisticRegression(C=best_c, penalty='l1')
lr.fit(X.values, y_true)
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

plt.figure(figsize=(10, 10))

# One subplot per threshold on a 3x3 grid (subplot positions start at 1).
for position, threshold in enumerate(thresholds, start=1):
    # A passenger is predicted as survived when P(class 1) is strictly
    # above the threshold.
    Y_hat = Y_hat_proba[:, 1] > threshold

    plt.subplot(3, 3, position)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_true, Y_hat)

    print("Recall value in training dataset: %f, with threshold = %.1f"
          % ((1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])),
             threshold))

    # Plot non-normalized confusion matrix. The title uses '>' to match the
    # strict comparison applied above.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % threshold)
Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9

thresholds_cnf

Testing

test_data = pd.read_csv('test.csv')

# Encode sex as a binary feature: female -> 0, male -> 1.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages and fares with the mean of the *observed* values.
# Series.mean() skips NaN; substituting 0 for NaN before averaging (as was
# done before) biases the imputed value downwards.
age_avg = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(age_avg)
fare_avg = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_avg)

# Standardize each numeric feature independently. The explicit float cast
# avoids StandardScaler's DataConversionWarning on the integer columns, and
# ravel() turns the (n, 1) result back into a 1-D column.
# NOTE(review): the scalers are re-fit on the test set here, so the test
# features are standardized with different statistics than the training
# features; consider reusing the scalers fitted on the training data.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    test_data[column] = StandardScaler().fit_transform(
        test_data[column].values.astype('float64').reshape(-1, 1)).ravel()

# Encode the port of embarkation as 1/2/3 and fill missing ports with the
# most frequent one.
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
test_data['Embarked'] = test_data['Embarked'].fillna(
    test_data['Embarked'].mode()[0])

# PassengerId is kept here because it is needed for the submission file.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data.head()
   PassengerId  Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0          892       3    1  0.428099 -0.499470 -0.400248 -0.498403         3
1          893       3    0  1.399492  0.616992 -0.400248 -0.513271         1
2          894       2    1  2.565163 -0.499470 -0.400248 -0.465085         3
3          895       3    1 -0.154736 -0.499470 -0.400248 -0.483463         1
4          896       3    0 -0.543293  0.616992  0.619896 -0.418468         1
# Display the first rows of the preprocessed training frame.
train_data.head()
   Survived  Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0         0       3    1 -0.494245  0.432793 -0.473674 -0.502445       1.0
1         1       1    0  0.717307  0.432793 -0.473674  0.786845       2.0
2         1       3    0 -0.191357 -0.474545 -0.473674 -0.488854       1.0
3         1       1    0  0.490141  0.432793 -0.473674  0.420730       1.0
4         0       3    1  0.490141 -0.474545 -0.473674 -0.486337       1.0
# Feature matrix for prediction: the test frame without the identifier
# column, which is not a model input.
X_test = test_data.drop(columns=['PassengerId'])
X_test.head()
   Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0       3    1  0.428099 -0.499470 -0.400248 -0.498403         3
1       3    0  1.399492  0.616992 -0.400248 -0.513271         1
2       2    1  2.565163 -0.499470 -0.400248 -0.465085         3
3       3    1 -0.154736 -0.499470 -0.400248 -0.483463         1
4       3    0 -0.543293  0.616992  0.619896 -0.418468         1
# Train on the full training set and predict survival probabilities for the
# test passengers. ravel() passes the target as the 1-D array scikit-learn
# expects, which avoids the column-vector DataConversionWarning.
lr = LogisticRegression(C=best_c, penalty='l1')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X_test.values)

# Predict "survived" only when P(class 1) is strictly above 0.6.
Y_hat = [1 if y > 0.6 else 0 for y in Y_hat_proba[:, 1]]

results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to the two columns PassengerId and Survived;
# without it the DataFrame index is written as an extra unnamed column.
results.to_csv('results.csv', index=False)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值