import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Load the training set and chart how many passengers fall into each value
# of the 'Survived' column.
train_data = pd.read_csv('train.csv')

# Series.value_counts() replaces the top-level pd.value_counts(), which newer
# pandas releases deprecate; the resulting counts are the same.
count_survivors = train_data['Survived'].value_counts()
count_survivors.plot(kind='bar')
plt.xlabel('Is_survived')
plt.ylabel('Number of People')
plt.title('Survivor histogram')
from sklearn.preprocessing import StandardScaler
# Encode and normalise the training features in place.

# Sex becomes a binary indicator.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Fill missing ages with the mean of the *known* ages. Series.mean() skips
# NaN; the previous version replaced NaN with 0 before averaging, which
# dragged the imputed value down in proportion to how many ages were missing.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# Standardise each numeric column independently with its own StandardScaler.
# Casting to float first avoids the int64 -> float64 DataConversionWarning,
# and ravel() hands pandas a 1-D array for the column assignment.
for column in ('Age', 'SibSp', 'Parch', 'Fare'):
    column_values = train_data[column].values.astype(float).reshape(-1, 1)
    train_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Embarkation port becomes an integer code. Missing ports take the most
# frequent known port: mode() ignores NaN, so the placeholder for "missing"
# can never be selected as the fill value.
embarked_codes = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
train_data['Embarked'] = embarked_codes.fillna(embarked_codes.mode()[0]).astype(int)

# Drop the identifier and free-text columns that are not used as features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
warnings.warn(msg, DataConversionWarning)
# Split the frame into the feature matrix X (every column except 'Survived')
# and the target Y (the 'Survived' column).
# .ix is deprecated and removed in later pandas releases; .loc accepts the
# same boolean column mask. Y is deliberately kept as a one-column DataFrame
# so that code receiving it (e.g. getBestC) sees the same type as before.
is_target = train_data.columns == 'Survived'
X = train_data.loc[:, ~is_target]
Y = train_data.loc[:, is_target]
c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
# Choose the regularisation strength C that the models below are built with.
# NOTE(review): getBestC is defined outside this excerpt; judging by the log
# it prints, it appears to score several candidate C values by mean recall
# over five iterations and report the winner -- confirm against its definition.
best_c = getBestC(X, Y)
******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000
Mean recall score 0.000000
******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413
Mean recall score 0.668273
******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032
Mean recall score 0.701604
******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905
Mean recall score 0.710576
******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905
Mean recall score 0.710576
--------------------------------
best_c = 10.00
c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
import itertools
# Fit an L1-penalised logistic regression on the full training set and check
# how well it recovers class 1 on that same data.
# solver='liblinear' is named explicitly because newer scikit-learn releases
# default to a solver that rejects penalty='l1'; ravel() passes y as the 1-D
# array fit() expects instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat = lr.predict(X.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(Y.values.ravel(), Y_hat)

# Recall for class 1: matrix entry [1, 1] over the sum of row 1.
# print(...) with a single pre-formatted string behaves the same under the
# Python 2 print statement and the Python 3 print function.
print("Recall value in training dataset: %f"
      % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall value in training dataset: 0.710526
# Show how the decision threshold applied to the predicted class-1
# probability changes recall: one confusion matrix per threshold, laid out
# on a 3x3 grid of subplots.
# solver='liblinear' is named explicitly because newer scikit-learn releases
# default to a solver that rejects penalty='l1'; ravel() passes y as the 1-D
# array fit() expects instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

plt.figure(figsize=(10, 10))
# j is the 1-based subplot position, i the threshold being evaluated.
for j, i in enumerate(thresholds, 1):
    # Column 1 of predict_proba is compared against the threshold; a row is
    # labelled positive only when its value is strictly greater.
    Y_hat = Y_hat_proba[:, 1] > i
    plt.subplot(3, 3, j)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y.values.ravel(), Y_hat)
    print("Recall value in training dataset: %f, with threshold = %.1f"
          % ((1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])), i))

    # Plot non-normalized confusion matrix.
    # The title uses '>' so that it states the comparison actually applied
    # above (the earlier '>=' label did not match the code).
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % i)
Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9
Testing
# Load the test set and apply the encoding / normalisation steps used for
# the training frame.
test_data = pd.read_csv('test.csv')

# Sex becomes a binary indicator.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Fill missing Age and Fare values with the mean of the *known* values.
# Series.mean() skips NaN; the previous version replaced NaN with 0 before
# averaging, which biased the imputed value towards zero.
for column in ('Age', 'Fare'):
    test_data[column] = test_data[column].fillna(test_data[column].mean())

# Standardise each numeric column independently with its own StandardScaler.
# Casting to float first avoids the int64 -> float64 DataConversionWarning,
# and ravel() hands pandas a 1-D array for the column assignment.
# NOTE(review): the scalers are fitted on the test data itself, so test
# features are not on the same scale as the training features the model was
# fitted on -- consider reusing scalers fitted on the training columns.
for column in ('Age', 'SibSp', 'Parch', 'Fare'):
    column_values = test_data[column].values.astype(float).reshape(-1, 1)
    test_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Embarkation port becomes an integer code. Missing ports take the most
# frequent known port: mode() ignores NaN, so the placeholder for "missing"
# can never be selected as the fill value.
embarked_codes = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
test_data['Embarked'] = embarked_codes.fillna(embarked_codes.mode()[0]).astype(int)

# PassengerId is kept here because the submission file needs it.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data.head()
PassengerId
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
0
892
3
1
0.428099
-0.499470
-0.400248
-0.498403
3
1
893
3
0
1.399492
0.616992
-0.400248
-0.513271
1
2
894
2
1
2.565163
-0.499470
-0.400248
-0.465085
3
3
895
3
1
-0.154736
-0.499470
-0.400248
-0.483463
1
4
896
3
0
-0.543293
0.616992
0.619896
-0.418468
1
# Display the first rows of the training frame (the table that follows is
# this cell's output).
train_data.head()
Survived
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
0
0
3
1
-0.494245
0.432793
-0.473674
-0.502445
1.0
1
1
1
0
0.717307
0.432793
-0.473674
0.786845
2.0
2
1
3
0
-0.191357
-0.474545
-0.473674
-0.488854
1.0
3
1
1
0
0.490141
0.432793
-0.473674
0.420730
1.0
4
0
3
1
0.490141
-0.474545
-0.473674
-0.486337
1.0
# Feature matrix for the test set: every column of test_data except the
# passenger identifier, which stays available in test_data itself.
X_test = test_data.drop(columns=['PassengerId'])
X_test.head()
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
0
3
1
0.428099
-0.499470
-0.400248
-0.498403
3
1
3
0
1.399492
0.616992
-0.400248
-0.513271
1
2
2
1
2.565163
-0.499470
-0.400248
-0.465085
3
3
3
1
-0.154736
-0.499470
-0.400248
-0.483463
1
4
3
0
-0.543293
0.616992
0.619896
-0.418468
1
# Train the final model on all training rows, label every test passenger and
# write the predictions to results.csv.
# solver='liblinear' is named explicitly because newer scikit-learn releases
# default to a solver that rejects penalty='l1'; ravel() passes y as the 1-D
# array fit() expects instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X_test.values)

# A passenger is labelled 1 only when the class-1 probability (column 1 of
# predict_proba) is strictly greater than the chosen threshold.
survival_threshold = 0.6
Y_hat = [1 if y > survival_threshold else 0 for y in Y_hat_proba[:, 1]]

results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])

# index=False keeps the DataFrame's row index out of the file, so the CSV
# holds exactly the PassengerId and Survived columns.
results.to_csv('results.csv', index=False)
数据来源: https://www.kaggle.com/c/titanic

Training

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
train_data = pd.read_csv('train.csv')
count_survivors = pd.value_counts(train_data['Survived'])
count_survivors.plot(