泰坦尼克号项目

import pandas as pd
df_train,df_test = pd.read_csv("F:/Python CODE/Kaggle_Titanic/train.csv"),pd.read_csv("F:/Python CODE/Kaggle_Titanic/test.csv") 
In [2]:
df_train.head()#查看表格的后5行
Out[2]:
 PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
 

SibSp -- 同船配偶以及兄弟姐妹的人数

Parch -- 同船父母或者子女的人数

Ticket -- 船票

Fare -- 票价

Cabin -- 舱位

Embarked -- 登船港口

In [3]:
df_train.info()  #查看数据表的整体信息
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
In [4]:
df_train.describe() #描述性统计
Out[4]:
 PassengerIdSurvivedPclassAgeSibSpParchFare
count891.000000891.000000891.000000714.000000891.000000891.000000891.000000
mean446.0000000.3838382.30864229.6991180.5230080.38159432.204208
std257.3538420.4865920.83607114.5264971.1027430.80605749.693429
min1.0000000.0000001.0000000.4200000.0000000.0000000.000000
25%223.5000000.0000002.00000020.1250000.0000000.0000007.910400
50%446.0000000.0000003.00000028.0000000.0000000.00000014.454200
75%668.5000001.0000003.00000038.0000001.0000000.00000031.000000
max891.0000001.0000003.00000080.0000008.0000006.000000512.329200
In [5]:
df_train[["Name","Sex","Ticket","Cabin","Embarked"]].describe()#对于object类型(python对象)同样用describe()处理 
Out[5]:
 NameSexTicketCabinEmbarked
count891891891204889
unique89126811473
topGreen, Mr. George HenrymaleCA. 2343G6S
freq157774644
In [6]:
#特征分析,在11个特征中,找哪些是和幸存相关
import numpy as np
import matplotlib.pyplot as plt Pclass_Survied = pd.crosstab(df_train['Pclass'],df_train['Survived'])#生成Pclass_Survied的列联表 
In [7]:
Pclass_Survied
Out[7]:
Survived01
Pclass  
180136
29787
3372119
In [8]:
Pclass_Survied.plot(kind = 'bar',stacked = True) #堆积柱形图 plt.show() 
 
In [9]:
Pclass_Survied.count()
Out[9]:
Survived
0    3
1    3
dtype: int64
In [10]:
Pclass_Survied.index
Out[10]:
Int64Index([1, 2, 3], dtype='int64', name='Pclass')
In [11]:
Survied_len = len(Pclass_Survied.count()) Pclass_index = np.arange(len(Pclass_Survied.index)) 
In [12]:
Pclass_index
Out[12]:
array([0, 1, 2])
In [13]:
Pclass_Survied
Out[13]:
Survived01
Pclass  
180136
29787
3372119
In [14]:
Pclass_Survied.plot(kind = 'bar',stacked = True) #堆积柱形图 Sum1 = 0 for i in range(Survied_len): SurvivedName = Pclass_Survied.columns[i] PclassCount = Pclass_Survied[SurvivedName] Sum1,Sum2 = Sum1+PclassCount,Sum1 Zsum =Sum2+(Sum1 - Sum2)/2 for x,y,z in zip(Pclass_index,PclassCount,Zsum): plt.text(x,z, '%.0f'%y, ha = 'center',va='center' )#添加数据标签 #修改x轴标签 plt.xticks(Pclass_Survied.index-1, Pclass_Survied.index, rotation=360) plt.title('Survived status by pclass') plt.show() 
 
In [15]:
a = df_train.Pclass[df_train['Survived']==0].value_counts() b = df_train.Pclass[df_train['Survived']==1].value_counts() Pclass_Survived = pd.DataFrame({ 0: a, 1: b}) 
In [16]:
Pclass_Survived
Out[16]:
 01
180136
29787
3372119
In [17]:
import re
df_train['Appellation'] = df_train.Name.apply(lambda x: re.search('\w+\.', x).group()).str.replace('.', '') df_train.Appellation.unique() 
Out[17]:
array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)
In [18]:
Application_Sex = pd.crosstab(df_train.Sex,df_train.Appellation) Application_Sex 
Out[18]:
AppellationCaptColCountessDonDrJonkheerLadyMajorMasterMissMlleMmeMrMrsMsRevSir
Sex                 
female001010100182210125100
male12016102400005170061
In [19]:
df_train['Appellation'] = df_train['Appellation'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'], 'Rare') df_train['Appellation'] = df_train['Appellation'].replace(['Mlle','Ms'], 'Miss') df_train['Appellation'] = df_train['Appellation'].replace('Mme', 'Mrs') df_train.Appellation.unique() 
Out[19]:
array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object)
In [44]:
Appellation_Survived = pd.crosstab(df_train['Appellation'], df_train['Survived']) Appellation_Survived.plot(kind = 'bar') plt.xticks(np.arange(len(Appellation_Survived.index)), Appellation_Survived.index, rotation = 360) plt.title('Survived status by Appellation') plt.show() 
 
 
 
 
 
 
 
In [24]:
Sex_Survived = pd.crosstab(df_train['Sex'],df_train['Survived']) 
In [45]:
#生成列联表
Sex_Survived = pd.crosstab(df_train['Sex'], df_train['Survived']) Survived_len = len(Sex_Survived.count()) Sex_index = np.arange(len(Sex_Survived.index)) single_width = 0.35 for i in range(Survived_len): SurvivedName = Sex_Survived.columns[i] SexCount = Sex_Survived[SurvivedName] SexLocation = Sex_index * 1.05 + (i - 1/2)*single_width #绘制柱形图 plt.bar(SexLocation, SexCount, width = single_width) for x, y in zip(SexLocation, SexCount): #添加数据标签 plt.text(x, y, '%.0f'%y, ha='center', va='bottom') index = Sex_index * 1.05 plt.xticks(index, Sex_Survived.index, rotation=360) plt.title('Survived status by sex') plt.show() 
 
In [46]:
SibSp_Survived = pd.crosstab(df_train['SibSp'], df_train['Survived']) SibSp_Survived.plot(kind = 'bar') plt.xticks(SibSp_Survived.index,SibSp_Survived.index,rotation = 360) plt.title('Survived status by SibSp') plt.show() 
 
In [47]:
SibSp_Survived = pd.crosstab(df_train.SibSp[df_train['SibSp']>2], df_train['Survived']) SibSp_Survived.plot(kind = 'bar') plt.xticks([0,1,2,3],SibSp_Survived.index,rotation = 360) plt.title('Survived status by SibSp') plt.show() 
 
In [28]:
Ticket_Count =  df_train.groupby('Ticket',as_index=False)['PassengerId'].count() 
In [29]:
Ticket_Count.head()
Out[29]:
 TicketPassengerId
01101523
11104133
21104652
31105641
41108131
In [30]:
#解释上行代码中的groupg中的as_index=False
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 'price': [12,12,12,15,15,17]}) print(df) print("*********************") print (df.groupby('books', as_index=True).sum()) print("*********************") print (df.groupby('books', as_index=False).sum()) 
 
  books  price
0   bk1     12
1   bk1     12
2   bk1     12
3   bk2     15
4   bk2     15
5   bk3     17
*********************
       price
books       
bk1       36
bk2       30
bk3       17
*********************
  books  price
0   bk1     36
1   bk2     30
2   bk3     17
In [31]:
Ticket_Count_0 = Ticket_Count[Ticket_Count.PassengerId == 1]['Ticket'] 
In [32]:
Ticket_Count_0.head()
Out[32]:
3    110564
4    110813
5    111240
6    111320
8    111369
Name: Ticket, dtype: object
In [33]:
df_train['GroupTicket'] = np.where(df_train.Ticket.isin(Ticket_Count_0),0,1) 
In [34]:
GroupTicket_Survived = pd.crosstab(df_train['GroupTicket'],df_train['Survived']) GroupTicket_Survived.plot(kind='bar') plt.xticks(rotation =360) 
Out[34]:
(array([0, 1]), <a list of 2 Text xticklabel objects>)
In [35]:
bins = [0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600] df_train['GroupFare'] = pd.cut(df_train.Fare,bins,right=False) GroupFare_Survived = pd.crosstab(df_train['GroupFare'],df_train['Survived']) GroupFare_Survived.plot(kind = 'bar') 
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0xac47eb8>
In [36]:
GroupFare_Survived.iloc[2:].plot(kind = 'bar') 
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0xa7a4ef0>
In [ ]:
#以上所有操作都是对特征中无缺失部分进行分析
#下一步则会在特征工程中对缺失部分进行处理Age、Cabin、Embarked
In [37]:
df_train['Embarked'].mode() 
Out[37]:
0    S
dtype: object
In [38]:
#df_train['Embarked'].mode()[0]  众数可能有多个,[0]代表取第一个
train = df_train.copy() train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0]) 
In [39]:
train['Cabin'] = train['Cabin'].fillna('NO') 
In [40]:
Age_Appellation_median = train.groupby('Appellation')['Age'].median() 
In [52]:
Age_Appellation_median
Out[52]:
Appellation
Master     3.5
Miss      21.0
Mr        30.0
Mrs       35.0
Rare      48.5
Name: Age, dtype: float64
In [59]:
train.set_index('Appellation', inplace = True) #在当前表填充缺失值 train.Age.fillna(Age_Appellation_median, inplace = True) #重置索引 train.reset_index(inplace = True) 
In [60]:
train
Out[60]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0Mr103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NOS0[0, 60)
1Mrs211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C0[60, 120)
2Miss313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NOS0[0, 60)
3Mrs411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1[0, 60)
4Mr503Allen, Mr. William Henrymale35.0003734508.0500NOS0[0, 60)
5Mr603Moran, Mr. Jamesmale30.0003308778.4583NOQ0[0, 60)
6Mr701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S0[0, 60)
7Master803Palsson, Master. Gosta Leonardmale2.03134990921.0750NOS1[0, 60)
8Mrs913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NOS1[0, 60)
9Mrs1012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NOC1[0, 60)
10Miss1113Sandstrom, Miss. Marguerite Rutfemale4.011PP 954916.7000G6S1[0, 60)
11Miss1211Bonnell, Miss. Elizabethfemale58.00011378326.5500C103S0[0, 60)
12Mr1303Saundercock, Mr. William Henrymale20.000A/5. 21518.0500NOS0[0, 60)
13Mr1403Andersson, Mr. Anders Johanmale39.01534708231.2750NOS1[0, 60)
14Miss1503Vestrom, Miss. Hulda Amanda Adolfinafemale14.0003504067.8542NOS0[0, 60)
15Mrs1612Hewlett, Mrs. (Mary D Kingcome)female55.00024870616.0000NOS0[0, 60)
16Master1703Rice, Master. Eugenemale2.04138265229.1250NOQ1[0, 60)
17Mr1812Williams, Mr. Charles Eugenemale30.00024437313.0000NOS0[0, 60)
18Mrs1903Vander Planke, Mrs. Julius (Emelia Maria Vande...female31.01034576318.0000NOS0[0, 60)
19Mrs2013Masselmani, Mrs. Fatimafemale35.00026497.2250NOC0[0, 60)
20Mr2102Fynney, Mr. Joseph Jmale35.00023986526.0000NOS1[0, 60)
21Mr2212Beesley, Mr. Lawrencemale34.00024869813.0000D56S0[0, 60)
22Miss2313McGowan, Miss. Anna "Annie"female15.0003309238.0292NOQ0[0, 60)
23Mr2411Sloper, Mr. William Thompsonmale28.00011378835.5000A6S0[0, 60)
24Miss2503Palsson, Miss. Torborg Danirafemale8.03134990921.0750NOS1[0, 60)
25Mrs2613Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...female38.01534707731.3875NOS1[0, 60)
26Mr2703Emir, Mr. Farred Chehabmale30.00026317.2250NOC0[0, 60)
27Mr2801Fortune, Mr. Charles Alexandermale19.03219950263.0000C23 C25 C27S1[240, 300)
28Miss2913O'Dwyer, Miss. Ellen "Nellie"female21.0003309597.8792NOQ0[0, 60)
29Mr3003Todoroff, Mr. Laliomale30.0003492167.8958NOS0[0, 60)
................................................
861Mr86202Giles, Mr. Frederick Edwardmale21.0102813411.5000NOS0[0, 60)
862Mrs86311Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48.0001746625.9292D17S0[0, 60)
863Miss86403Sage, Miss. Dorothy Edith "Dolly"female21.082CA. 234369.5500NOS1[60, 120)
864Mr86502Gill, Mr. John Williammale24.00023386613.0000NOS0[0, 60)
865Mrs86612Bystrom, Mrs. (Karolina)female42.00023685213.0000NOS0[0, 60)
866Miss86712Duran y More, Miss. Asuncionfemale27.010SC/PARIS 214913.8583NOC0[0, 60)
867Mr86801Roebling, Mr. Washington Augustus IImale31.000PC 1759050.4958A24S0[0, 60)
868Mr86903van Melkebeke, Mr. Philemonmale30.0003457779.5000NOS0[0, 60)
869Master87013Johnson, Master. Harold Theodormale4.01134774211.1333NOS1[0, 60)
870Mr87103Balkic, Mr. Cerinmale26.0003492487.8958NOS0[0, 60)
871Mrs87211Beckwith, Mrs. Richard Leonard (Sallie Monypeny)female47.0111175152.5542D35S1[0, 60)
872Mr87301Carlsson, Mr. Frans Olofmale33.0006955.0000B51 B53 B55S0[0, 60)
873Mr87403Vander Cruyssen, Mr. Victormale47.0003457659.0000NOS0[0, 60)
874Mrs87512Abelson, Mrs. Samuel (Hannah Wizosky)female28.010P/PP 338124.0000NOC1[0, 60)
875Miss87613Najib, Miss. Adele Kiamie "Jane"female15.00026677.2250NOC0[0, 60)
876Mr87703Gustafsson, Mr. Alfred Ossianmale20.00075349.8458NOS1[0, 60)
877Mr87803Petroff, Mr. Nedeliomale19.0003492127.8958NOS0[0, 60)
878Mr87903Laleff, Mr. Kristomale30.0003492177.8958NOS0[0, 60)
879Mrs88011Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)female56.0011176783.1583C50C1[60, 120)
880Mrs88112Shelley, Mrs. William (Imanita Parrish Hall)female25.00123043326.0000NOS1[0, 60)
881Mr88203Markun, Mr. Johannmale33.0003492577.8958NOS0[0, 60)
882Miss88303Dahlberg, Miss. Gerda Ulrikafemale22.000755210.5167NOS0[0, 60)
883Mr88402Banfield, Mr. Frederick Jamesmale28.000C.A./SOTON 3406810.5000NOS0[0, 60)
884Mr88503Sutehall, Mr. Henry Jrmale25.000SOTON/OQ 3920767.0500NOS0[0, 60)
885Mrs88603Rice, Mrs. William (Margaret Norton)female39.00538265229.1250NOQ1[0, 60)
886Rare88702Montvila, Rev. Juozasmale27.00021153613.0000NOS0[0, 60)
887Miss88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S0[0, 60)
888Miss88903Johnston, Miss. Catherine Helen "Carrie"female21.012W./C. 660723.4500NOS1[0, 60)
889Mr89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C0[0, 60)
890Mr89103Dooley, Mr. Patrickmale32.0003703767.7500NOQ0[0, 60)

891 rows × 15 columns

In [62]:
train.Age.isnull().sum() 
Out[62]:
0
In [64]:
train.Age.isnull().any() 
Out[64]:
False
In [65]:
train.Age.describe() 
Out[65]:
count    891.000000
mean      29.392447
std       13.268389
min        0.420000
25%       21.000000
50%       30.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64
In [66]:
Embarked_Survived = pd.crosstab(train['Embarked'],train['Survived']) 
In [68]:
Embarked_Survived.plot(kind = 'bar') plt.xticks(rotation = 360) plt.title('Survived status by Embarked') plt.show() 
 
In [69]:
train
Out[69]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0Mr103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NOS0[0, 60)
1Mrs211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C0[60, 120)
2Miss313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NOS0[0, 60)
3Mrs411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1[0, 60)
4Mr503Allen, Mr. William Henrymale35.0003734508.0500NOS0[0, 60)
5Mr603Moran, Mr. Jamesmale30.0003308778.4583NOQ0[0, 60)
6Mr701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S0[0, 60)
7Master803Palsson, Master. Gosta Leonardmale2.03134990921.0750NOS1[0, 60)
8Mrs913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NOS1[0, 60)
9Mrs1012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NOC1[0, 60)
10Miss1113Sandstrom, Miss. Marguerite Rutfemale4.011PP 954916.7000G6S1[0, 60)
11Miss1211Bonnell, Miss. Elizabethfemale58.00011378326.5500C103S0[0, 60)
12Mr1303Saundercock, Mr. William Henrymale20.000A/5. 21518.0500NOS0[0, 60)
13Mr1403Andersson, Mr. Anders Johanmale39.01534708231.2750NOS1[0, 60)
14Miss1503Vestrom, Miss. Hulda Amanda Adolfinafemale14.0003504067.8542NOS0[0, 60)
15Mrs1612Hewlett, Mrs. (Mary D Kingcome)female55.00024870616.0000NOS0[0, 60)
16Master1703Rice, Master. Eugenemale2.04138265229.1250NOQ1[0, 60)
17Mr1812Williams, Mr. Charles Eugenemale30.00024437313.0000NOS0[0, 60)
18Mrs1903Vander Planke, Mrs. Julius (Emelia Maria Vande...female31.01034576318.0000NOS0[0, 60)
19Mrs2013Masselmani, Mrs. Fatimafemale35.00026497.2250NOC0[0, 60)
20Mr2102Fynney, Mr. Joseph Jmale35.00023986526.0000NOS1[0, 60)
21Mr2212Beesley, Mr. Lawrencemale34.00024869813.0000D56S0[0, 60)
22Miss2313McGowan, Miss. Anna "Annie"female15.0003309238.0292NOQ0[0, 60)
23Mr2411Sloper, Mr. William Thompsonmale28.00011378835.5000A6S0[0, 60)
24Miss2503Palsson, Miss. Torborg Danirafemale8.03134990921.0750NOS1[0, 60)
25Mrs2613Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...female38.01534707731.3875NOS1[0, 60)
26Mr2703Emir, Mr. Farred Chehabmale30.00026317.2250NOC0[0, 60)
27Mr2801Fortune, Mr. Charles Alexandermale19.03219950263.0000C23 C25 C27S1[240, 300)
28Miss2913O'Dwyer, Miss. Ellen "Nellie"female21.0003309597.8792NOQ0[0, 60)
29Mr3003Todoroff, Mr. Laliomale30.0003492167.8958NOS0[0, 60)
................................................
861Mr86202Giles, Mr. Frederick Edwardmale21.0102813411.5000NOS0[0, 60)
862Mrs86311Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48.0001746625.9292D17S0[0, 60)
863Miss86403Sage, Miss. Dorothy Edith "Dolly"female21.082CA. 234369.5500NOS1[60, 120)
864Mr86502Gill, Mr. John Williammale24.00023386613.0000NOS0[0, 60)
865Mrs86612Bystrom, Mrs. (Karolina)female42.00023685213.0000NOS0[0, 60)
866Miss86712Duran y More, Miss. Asuncionfemale27.010SC/PARIS 214913.8583NOC0[0, 60)
867Mr86801Roebling, Mr. Washington Augustus IImale31.000PC 1759050.4958A24S0[0, 60)
868Mr86903van Melkebeke, Mr. Philemonmale30.0003457779.5000NOS0[0, 60)
869Master87013Johnson, Master. Harold Theodormale4.01134774211.1333NOS1[0, 60)
870Mr87103Balkic, Mr. Cerinmale26.0003492487.8958NOS0[0, 60)
871Mrs87211Beckwith, Mrs. Richard Leonard (Sallie Monypeny)female47.0111175152.5542D35S1[0, 60)
872Mr87301Carlsson, Mr. Frans Olofmale33.0006955.0000B51 B53 B55S0[0, 60)
873Mr87403Vander Cruyssen, Mr. Victormale47.0003457659.0000NOS0[0, 60)
874Mrs87512Abelson, Mrs. Samuel (Hannah Wizosky)female28.010P/PP 338124.0000NOC1[0, 60)
875Miss87613Najib, Miss. Adele Kiamie "Jane"female15.00026677.2250NOC0[0, 60)
876Mr87703Gustafsson, Mr. Alfred Ossianmale20.00075349.8458NOS1[0, 60)
877Mr87803Petroff, Mr. Nedeliomale19.0003492127.8958NOS0[0, 60)
878Mr87903Laleff, Mr. Kristomale30.0003492177.8958NOS0[0, 60)
879Mrs88011Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)female56.0011176783.1583C50C1[60, 120)
880Mrs88112Shelley, Mrs. William (Imanita Parrish Hall)female25.00123043326.0000NOS1[0, 60)
881Mr88203Markun, Mr. Johannmale33.0003492577.8958NOS0[0, 60)
882Miss88303Dahlberg, Miss. Gerda Ulrikafemale22.000755210.5167NOS0[0, 60)
883Mr88402Banfield, Mr. Frederick Jamesmale28.000C.A./SOTON 3406810.5000NOS0[0, 60)
884Mr88503Sutehall, Mr. Henry Jrmale25.000SOTON/OQ 3920767.0500NOS0[0, 60)
885Mrs88603Rice, Mrs. William (Margaret Norton)female39.00538265229.1250NOQ1[0, 60)
886Rare88702Montvila, Rev. Juozasmale27.00021153613.0000NOS0[0, 60)
887Miss88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S0[0, 60)
888Miss88903Johnston, Miss. Catherine Helen "Carrie"female21.012W./C. 660723.4500NOS1[0, 60)
889Mr89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C0[0, 60)
890Mr89103Dooley, Mr. Patrickmale32.0003703767.7500NOQ0[0, 60)

891 rows × 15 columns

In [80]:
train['GroupCabin'] = np.where(train['Cabin'] == 'NO',0,1) 
In [82]:
GroupCabin_Survived = pd.crosstab(train['GroupCabin'],train['Survived']) GroupCabin_Survived.plot(kind = 'bar') plt.title('Survived status by GroupCabin') plt.xticks(rotation=360) plt.show() 
 
In [86]:
#对Age进行分组: 2**10>891分成10组, 组距为(最大值80-最小值0)/10 =8取9
bins = [0, 9, 18, 27, 36, 45, 54, 63, 72, 81, 90] train['GroupAge'] = pd.cut(train.Age, bins) GroupAge_Survived = pd.crosstab(train['GroupAge'], train['Survived']) GroupAge_Survived.plot(kind = 'bar') plt.title('Survived status by GroupAge') plt.show() 
 
In [87]:
train['Appellation'] = train.Appellation.map({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rare': 4}) train.Appellation.unique() 
Out[87]:
array([0, 1, 2, 3, 4], dtype=int64)
In [89]:
train['Sex'] = train.Sex.map({'female':0,'male':1}) 
In [90]:
train.head()
Out[90]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
00103Braund, Mr. Owen Harris122.010A/5 211717.2500NOS0[0, 60)0(18, 27]
11211Cumings, Mrs. John Bradley (Florence Briggs Th...038.010PC 1759971.2833C85C0[60, 120)1(36, 45]
22313Heikkinen, Miss. Laina026.000STON/O2. 31012827.9250NOS0[0, 60)0(18, 27]
31411Futrelle, Mrs. Jacques Heath (Lily May Peel)035.01011380353.1000C123S1[0, 60)1(27, 36]
40503Allen, Mr. William Henry135.0003734508.0500NOS0[0, 60)0(27, 36]
In [95]:
train.loc[train['Age'] < 9, 'Age']=0 train.loc[(train['Age'] >= 9) & (train['Age'] < 18), 'Age'] = 1 train.loc[(train['Age'] >= 18) & (train['Age'] < 27), 'Age'] = 2 train.loc[(train['Age'] >= 27) & (train['Age'] < 36), 'Age'] = 3 train.loc[(train['Age'] >= 36) & (train['Age'] < 45), 'Age'] = 4 train.loc[(train['Age'] >= 45) & (train['Age'] < 54), 'Age'] = 5 train.loc[(train['Age'] >= 54) & (train['Age'] < 63), 'Age'] = 6 train.loc[(train['Age'] >= 63) & (train['Age'] < 72), 'Age'] = 7 train.loc[(train['Age'] >= 72) & (train['Age'] < 81), 'Age'] = 8 train.loc[(train['Age'] >= 81) & (train['Age'] < 90), 'Age'] = 9 train.Age.unique() 
Out[95]:
array([ 2.,  4.,  3.,  6.,  0.,  1.,  7.,  5.,  8.])
In [96]:
train.head()
Out[96]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
00103Braund, Mr. Owen Harris12.010A/5 211717.2500NOS0[0, 60)0(18, 27]
11211Cumings, Mrs. John Bradley (Florence Briggs Th...04.010PC 1759971.2833C85C0[60, 120)1(36, 45]
22313Heikkinen, Miss. Laina02.000STON/O2. 31012827.9250NOS0[0, 60)0(18, 27]
31411Futrelle, Mrs. Jacques Heath (Lily May Peel)03.01011380353.1000C123S1[0, 60)1(27, 36]
40503Allen, Mr. William Henry13.0003734508.0500NOS0[0, 60)0(27, 36]
In [97]:
#当SibSp和Parch都为0时, 则孤身一人.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1 train.FamilySize.unique() 
Out[97]:
array([ 2,  1,  5,  3,  7,  6,  4,  8, 11], dtype=int64)
In [98]:
train.loc[train['Fare'] < 60, 'Fare'] = 0 train.loc[(train['Fare'] >= 60) & (train['Fare'] < 120), 'Fare'] = 1 train.loc[(train['Fare'] >= 120) & (train['Fare'] < 180), 'Fare'] = 2 train.loc[(train['Fare'] >= 180) & (train['Fare'] < 240), 'Fare'] = 3 train.loc[(train['Fare'] >= 240) & (train['Fare'] < 300), 'Fare'] = 4 train.loc[(train['Fare'] >= 300) & (train['Fare'] < 360), 'Fare'] = 5 train.loc[(train['Fare'] >= 360) & (train['Fare'] < 420), 'Fare'] = 6 train.loc[(train['Fare'] >= 420) & (train['Fare'] < 480), 'Fare'] = 7 train.loc[(train['Fare'] >= 480) & (train['Fare'] < 540), 'Fare'] = 8 train.loc[(train['Fare'] >= 540) & (train['Fare'] < 600), 'Fare'] = 9 train.Fare.unique() 
Out[98]:
array([ 0.,  1.,  4.,  2.,  8.,  3.])
In [99]:
train['Embarked'] = train.Embarked.map({'S': 0, 'C': 1, 'Q': 2}) 
In [100]:
train.drop(['PassengerId', 'Name', 'GroupAge', 'SibSp', 'Parch', 'Ticket', 'GroupFare', 'Cabin'], axis = 1, inplace =True) 
In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split X=train[['Pclass', 'Appellation', 'Sex', 'Age', 'FamilySize', 'GroupTicket', 'Fare', 'GroupCabin', 'Embarked']] y=train['Survived'] #随机划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #逻辑回归模型初始化 lg = LogisticRegression() #训练逻辑回归模型 lg.fit(X_train, y_train) #用测试数据检验模型好坏 lg.score(X_test, y_test) 
Out[110]:
0.78212290502793291
In [111]:
from sklearn.tree import DecisionTreeClassifier
#树的最大深度为15, 内部节点再划分所需最小样本数为2, 叶节点最小样本数1, 最大叶子节点数10, 每次分类的最大特征数6
dt = DecisionTreeClassifier(max_depth=15, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=10, max_features=6) dt.fit(X_train, y_train) dt.score(X_test, y_test) 
Out[111]:
0.79329608938547491
In [126]:
#支持向量机SVM
from sklearn.cross_validation import cross_val_score, KFold from scipy.stats import sem # 构造一个便于交叉验证模型性能的函数(模块) def evaluate_cross_validation(clf, X, y, K): # KFold 函数需要如下参数:数据量, 叉验次数, 是否洗牌 cv = KFold(len(y), K, shuffle=True, random_state = 0) # 采用上述的分隔方式进行交叉验证,测试模型性能,对于分类问题,这些得分默认是accuracy,也可以修改为别的 scores = cross_val_score(clf, X, y, cv=cv) print (scores) print ('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))) # 使用线性核的SVC (后面会说到不同的核,结果可能大不相同) svc_linear = SVC(kernel='rbf')#‘linear’:线性核函数‘poly’:多项式核函数‘rbf’:径像核函数/高斯核‘sigmod’:sigmod核函数‘precomputed’:核矩阵 # 五折交叉验证 K = 5 evaluate_cross_validation(svc_linear, X_train, y_train, 5) 
 
[ 0.82517483  0.86013986  0.80985915  0.83802817  0.87323944]
Mean score: 0.841 (+/-0.011)
In [118]:
#线性分类器
from sklearn.linear_model import SGDClassifier
# 选择使用SGD分类器,适合大规模数据,随机梯度下降方法估计参数 clf = SGDClassifier() clf.fit(X_train, y_train) # 导入评价包 from sklearn import metrics y_train_predict = clf.predict(X_train) # 内测,使用训练样本进行准确性能评估 print(metrics.accuracy_score(y_train, y_train_predict)) # 标准外测,使用测试样本进行准确性能评估 y_predict = clf.predict(X_test) print(metrics.accuracy_score(y_test, y_predict)) 
 
0.651685393258
0.659217877095
In [123]:
#朴素贝叶斯分类器
from sklearn.naive_bayes import GaussianNB 
clf = GaussianNB() clf.fit(X_train, y_train) y_predict =clf.predict(X_test) from sklearn.metrics import accuracy_score print(accuracy_score(y_test, y_predict)) 
 
0.765363128492
 

转载于:https://www.cnblogs.com/USTC-ZCC/p/10018777.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值