import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
print(titanic_survival.shape)
(891, 12)
# The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# We can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
print(age.loc[0:10])
age_is_null = pd.isnull(age)
print(age_is_null)
age_null_true = age[age_is_null]
print(age_null_true)
print(len(age_null_true))
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
5 True
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 True
18 False
19 True
20 False
21 False
22 False
23 False
24 False
25 False
26 True
27 False
28 True
29 True
...
861 False
862 False
863 True
864 False
865 False
866 False
867 False
868 True
869 False
870 False
871 False
872 False
873 False
874 False
875 False
876 False
877 False
878 True
879 False
880 False
881 False
882 False
883 False
884 False
885 False
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
29 NaN
31 NaN
32 NaN
36 NaN
42 NaN
45 NaN
46 NaN
47 NaN
48 NaN
55 NaN
64 NaN
65 NaN
76 NaN
77 NaN
82 NaN
87 NaN
95 NaN
101 NaN
107 NaN
109 NaN
121 NaN
126 NaN
128 NaN
140 NaN
154 NaN
..
718 NaN
727 NaN
732 NaN
738 NaN
739 NaN
740 NaN
760 NaN
766 NaN
768 NaN
773 NaN
776 NaN
778 NaN
783 NaN
790 NaN
792 NaN
793 NaN
815 NaN
825 NaN
826 NaN
828 NaN
832 NaN
837 NaN
839 NaN
846 NaN
849 NaN
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
# The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"])/len(titanic_survival["Age"])
print(mean_age)
nan
# We have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null==False]
correct_mean_age = sum(good_ages)/len(good_ages)
print(correct_mean_age)
29.69911764705882
# Missing data is so common that many pandas methods automatically filter for it
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
29.69911764705882
# Mean fare for each class
passenger_classes = [1,2,3]
fares_by_class = {}
for this_class in passenger_classes:
pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
pclass_fares = pclass_rows["Fare"]
fare_for_class = pclass_fares.mean()
fares_by_class[this_class] = fare_for_class
print(fares_by_class)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# index tells the method which column to group by values is the column that we want to apply
# the calculation to aggrunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived", aggfunc=np.mean)
print(passenger_survival)
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
#specifying axis=1 or axis='columns' will drop any columns that have null values
print(titanic_survival.shape)
drop_na_columns = titanic_survival.dropna(axis=1)
print(drop_na_columns.shape)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
print(new_titanic_survival)
(891, 12)
(891, 9)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
6 7 0 1
7 8 0 3
8 9 1 3
9 10 1 2
10 11 1 3
11 12 1 1
12 13 0 3
13 14 0 3
14 15 0 3
15 16 1 2
16 17 0 3
18 19 0 3
20 21 0 2
21 22 1 2
22 23 1 3
23 24 1 1
24 25 0 3
25 26 1 3
27 28 0 1
30 31 0 1
33 34 0 2
34 35 0 1
35 36 0 1
37 38 0 3
38 39 0 3
.. ... ... ...
856 857 1 1
857 858 1 1
858 859 1 3
860 861 0 3
861 862 0 2
862 863 1 1
864 865 0 2
865 866 1 2
866 867 1 2
867 868 0 1
869 870 1 3
870 871 0 3
871 872 1 1
872 873 0 1
873 874 0 3
874 875 1 2
875 876 1 3
876 877 0 3
877 878 0 3
879 880 1 1
880 881 1 2
881 882 0 3
882 883 0 3
883 884 0 2
884 885 0 3
885 886 0 3
886 887 0 2
887 888 1 1
889 890 1 1
890 891 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
6 McCarthy, Mr. Timothy J male 54.0 0
7 Palsson, Master. Gosta Leonard male 2.0 3
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0
9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1
10 Sandstrom, Miss. Marguerite Rut female 4.0 1
11 Bonnell, Miss. Elizabeth female 58.0 0
12 Saundercock, Mr. William Henry male 20.0 0
13 Andersson, Mr. Anders Johan male 39.0 1
14 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0
15 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0
16 Rice, Master. Eugene male 2.0 4
18 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1
20 Fynney, Mr. Joseph J male 35.0 0
21 Beesley, Mr. Lawrence male 34.0 0
22 McGowan, Miss. Anna "Annie" female 15.0 0
23 Sloper, Mr. William Thompson male 28.0 0
24 Palsson, Miss. Torborg Danira female 8.0 3
25 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1
27 Fortune, Mr. Charles Alexander male 19.0 3
30 Uruchurtu, Don. Manuel E male 40.0 0
33 Wheadon, Mr. Edward H male 66.0 0
34 Meyer, Mr. Edgar Joseph male 28.0 1
35 Holverson, Mr. Alexander Oskar male 42.0 1
37 Cann, Mr. Ernest Charles male 21.0 0
38 Vander Planke, Miss. Augusta Maria female 18.0 2
.. ... ... ... ...
856 Wick, Mrs. George Dennick (Mary Hitchcock) female 45.0 1
857 Daly, Mr. Peter Denis male 51.0 0
858 Baclini, Mrs. Solomon (Latifa Qurban) female 24.0 0
860 Hansen, Mr. Claus Peter male 41.0 2
861 Giles, Mr. Frederick Edward male 21.0 1
862 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0
864 Gill, Mr. John William male 24.0 0
865 Bystrom, Mrs. (Karolina) female 42.0 0
866 Duran y More, Miss. Asuncion female 27.0 1
867 Roebling, Mr. Washington Augustus II male 31.0 0
869 Johnson, Master. Harold Theodor male 4.0 1
870 Balkic, Mr. Cerin male 26.0 0
871 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1
872 Carlsson, Mr. Frans Olof male 33.0 0
873 Vander Cruyssen, Mr. Victor male 47.0 0
874 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1
875 Najib, Miss. Adele Kiamie "Jane" female 15.0 0
876 Gustafsson, Mr. Alfred Ossian male 20.0 0
877 Petroff, Mr. Nedelio male 19.0 0
879 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0
880 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0
881 Markun, Mr. Johann male 33.0 0
882 Dahlberg, Miss. Gerda Ulrika female 22.0 0
883 Banfield, Mr. Frederick James male 28.0 0
884 Sutehall, Mr. Henry Jr male 25.0 0
885 Rice, Mrs. William (Margaret Norton) female 39.0 0
886 Montvila, Rev. Juozas male 27.0 0
887 Graham, Miss. Margaret Edith female 19.0 0
889 Behr, Mr. Karl Howell male 26.0 0
890 Dooley, Mr. Patrick male 32.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
6 0 17463 51.8625 E46 S
7 1 349909 21.0750 NaN S
8 2 347742 11.1333 NaN S
9 0 237736 30.0708 NaN C
10 1 PP 9549 16.7000 G6 S
11 0 113783 26.5500 C103 S
12 0 A/5. 2151 8.0500 NaN S
13 5 347082 31.2750 NaN S
14 0 350406 7.8542 NaN S
15 0 248706 16.0000 NaN S
16 1 382652 29.1250 NaN Q
18 0 345763 18.0000 NaN S
20 0 239865 26.0000 NaN S
21 0 248698 13.0000 D56 S
22 0 330923 8.0292 NaN Q
23 0 113788 35.5000 A6 S
24 1 349909 21.0750 NaN S
25 5 347077 31.3875 NaN S
27 2 19950 263.0000 C23 C25 C27 S
30 0 PC 17601 27.7208 NaN C
33 0 C.A. 24579 10.5000 NaN S
34 0 PC 17604 82.1708 NaN C
35 0 113789 52.0000 NaN S
37 0 A./5. 2152 8.0500 NaN S
38 0 345764 18.0000 NaN S
.. ... ... ... ... ...
856 1 36928 164.8667 NaN S
857 0 113055 26.5500 E17 S
858 3 2666 19.2583 NaN C
860 0 350026 14.1083 NaN S
861 0 28134 11.5000 NaN S
862 0 17466 25.9292 D17 S
864 0 233866 13.0000 NaN S
865 0 236852 13.0000 NaN S
866 0 SC/PARIS 2149 13.8583 NaN C
867 0 PC 17590 50.4958 A24 S
869 1 347742 11.1333 NaN S
870 0 349248 7.8958 NaN S
871 1 11751 52.5542 D35 S
872 0 695 5.0000 B51 B53 B55 S
873 0 345765 9.0000 NaN S
874 0 P/PP 3381 24.0000 NaN C
875 0 2667 7.2250 NaN C
876 0 7534 9.8458 NaN S
877 0 349212 7.8958 NaN S
879 1 11767 83.1583 C50 C
880 1 230433 26.0000 NaN S
881 0 349257 7.8958 NaN S
882 0 7552 10.5167 NaN S
883 0 C.A./SOTON 34068 10.5000 NaN S
884 0 SOTON/OQ 392076 7.0500 NaN S
885 5 382652 29.1250 NaN Q
886 0 211536 13.0000 NaN S
887 0 112053 30.0000 B42 S
889 0 111369 30.0000 C148 C
890 0 370376 7.7500 NaN Q
[714 rows x 12 columns]
row_index_83_age = titanic_survival.loc[83,"Age"]
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
print(titanic_survival.loc[83])
28.0
1
PassengerId 84
Survived 0
Pclass 1
Name Carrau, Mr. Francisco M
Sex male
Age 28
SibSp 0
Parch 0
Ticket 113059
Fare 47.1
Cabin NaN
Embarked S
Name: 83, dtype: object
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.loc[0:10])
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
10 457 0 1 Millet, Mr. Francis Davis male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
10 65.0 0 0 13509 26.5500 E38 S
# This function returns the hundredth item from a series
def hundredth_row(column):
# Extract the hundredth item
hundreth_item = column.iloc[99]
return hundreth_item
# Return the hundredth item from each column
hundreth_row = titanic_survival.apply(hundredth_row)
print(hundreth_row)
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
def not_null_count(column):
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
pclass = row['Pclass']
if pd.isnull(pclass):
return "Unknown"
elif pclass == 1:
return "First Class"
elif pclass == 2:
return "Second Class"
elif pclass == 3:
return "Third Class"
classes = titanic_survival.apply(which_class,axis=1)
print(classes)
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
5 Third Class
6 First Class
7 Third Class
8 Third Class
9 Second Class
10 Third Class
11 First Class
12 Third Class
13 Third Class
14 Third Class
15 Second Class
16 Third Class
17 Second Class
18 Third Class
19 Third Class
20 Second Class
21 Second Class
22 Third Class
23 First Class
24 Third Class
25 Third Class
26 Third Class
27 First Class
28 Third Class
29 Third Class
...
861 Second Class
862 First Class
863 Third Class
864 Second Class
865 Second Class
866 Second Class
867 First Class
868 Third Class
869 Third Class
870 Third Class
871 First Class
872 First Class
873 Third Class
874 Second Class
875 Third Class
876 Third Class
877 Third Class
878 Third Class
879 First Class
880 Second Class
881 Third Class
882 Third Class
883 Second Class
884 Third Class
885 Third Class
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
Length: 891, dtype: object
def is_minor(row):
if row["Age"]<18:
return True
else:
return False
minors = titanic_survival.apply(is_minor,axis=1)
print(minors)
def generate_age_label(row):
age = row["Age"]
if pd.isnull(age):
return "unknown"
elif age<18:
return "minor"
else:
return "adult"
age_labels = titanic_survival.apply(generate_age_label,axis=1)
print(age_labels)
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 True
8 False
9 True
10 True
11 False
12 False
13 False
14 True
15 False
16 True
17 False
18 False
19 False
20 False
21 False
22 True
23 False
24 True
25 False
26 False
27 False
28 False
29 False
...
861 False
862 False
863 False
864 False
865 False
866 False
867 False
868 False
869 True
870 False
871 False
872 False
873 False
874 False
875 True
876 False
877 False
878 False
879 False
880 False
881 False
882 False
883 False
884 False
885 False
886 False
887 False
888 False
889 False
890 False
Length: 891, dtype: bool
0 adult
1 adult
2 adult
3 adult
4 adult
5 unknown
6 adult
7 minor
8 adult
9 minor
10 minor
11 adult
12 adult
13 adult
14 minor
15 adult
16 minor
17 unknown
18 adult
19 unknown
20 adult
21 adult
22 minor
23 adult
24 minor
25 adult
26 unknown
27 adult
28 unknown
29 unknown
...
861 adult
862 adult
863 unknown
864 adult
865 adult
866 adult
867 adult
868 unknown
869 minor
870 adult
871 adult
872 adult
873 adult
874 adult
875 minor
876 adult
877 adult
878 unknown
879 adult
880 adult
881 adult
882 adult
883 adult
884 adult
885 adult
886 adult
887 adult
888 unknown
889 adult
890 adult
Length: 891, dtype: object
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels",values="Survived")
print(age_group_survival)
Survived
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785