In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,\
ExtraTreesClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
In [2]:
# NOTE(review): hardcoded absolute local path — consider a configurable data dir.
DATA_PATH = "C:/Code/Kaggle/Titanic/"

# Load the Kaggle Titanic splits; keep the test PassengerIds for the
# submission file built at the end.
train = pd.read_csv(DATA_PATH + "train.csv")
test = pd.read_csv(DATA_PATH + "test.csv")
IDtest = test["PassengerId"]
In [3]:
def detect_outliers(df, n, features):
    """Return index labels of rows that are Tukey-rule outliers in more
    than ``n`` of the given columns.

    For each column, a value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is an
    outlier for that column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scan.
    n : int
        A row is returned only if it is an outlier in strictly more than
        ``n`` of ``features``.
    features : list of str
        Numeric column names to check.

    Returns
    -------
    list
        Index labels of rows flagged in more than ``n`` columns.
    """
    outlier_indices = []
    for col in features:
        # BUGFIX: np.percentile returns NaN for any column containing NaN
        # (e.g. "Age"), which made every comparison False and silently
        # disabled outlier detection for that column. nanpercentile ignores
        # missing values instead.
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        # NaN values compare False on both sides, so rows missing this
        # column are never flagged for it.
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    # Keep only rows that are outliers in more than n of the columns.
    counts = Counter(outlier_indices)
    return [idx for idx, cnt in counts.items() if cnt > n]
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
In [4]:
train.loc[Outliers_to_drop]
Out[4]:
In [5]:
train = train.drop(Outliers_to_drop,axis=0).reset_index(drop=True)
In [6]:
# Remember where train ends so the combined frame can be split back later,
# then stack train and test so feature engineering touches both at once.
train_len = len(train)
dataset = pd.concat([train, test], ignore_index=True)
dataset.tail()
Out[6]:
In [7]:
# Missing-value count per column of the combined frame ("Survived" is
# NaN for the test rows, since test.csv has no such column).
dataset.isna().sum()
Out[7]:
In [8]:
# Dtype/non-null overview of the training split, then per-column NaN counts.
train.info()
train.isna().sum()
Out[8]:
In [9]:
train.describe()
Out[9]:
In [10]:
g = sns.heatmap(train[["Survived","SibSp","Parch","Age","Fare"]].corr(), annot=True, fmt=".2f", cmap = "coolwarm")
In [11]:
# Survival rate by number of siblings/spouses aboard.
# sns.factorplot was deprecated in seaborn 0.9 and removed in later
# releases; catplot is the direct replacement (kind="bar" is unchanged).
g = sns.catplot(x="SibSp", y="Survived", data=train, kind="bar")
g = g.set_ylabels("survival probability")
In [12]:
# Survival rate by number of parents/children aboard.
# sns.factorplot was deprecated in seaborn 0.9 and removed in later
# releases; catplot is the direct replacement (kind="bar" is unchanged).
g = sns.catplot(x="Parch", y="Survived", data=train, kind="bar")
g = g.set_ylabels("survival probability")
In [13]:
# Age distributions of victims vs survivors, overlaid on a single axis.
# `shade=` was renamed `fill=` (deprecated in seaborn 0.11, later removed),
# and .loc replaces the chained-indexing pattern train["Age"][mask].
age_not_survived = train.loc[(train["Survived"] == 0) & (train["Age"].notnull()), "Age"]
age_survived = train.loc[(train["Survived"] == 1) & (train["Age"].notnull()), "Age"]
g = sns.kdeplot(age_not_survived, color="Red", fill=True)
g = sns.kdeplot(age_survived, ax=g, color="Blue", fill=True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived","Survived"])