import numpy as np
import pandas as pd
# Load the Titanic training set (hard-coded local path — adjust per machine).
train_data = pd.read_csv(r"C:\Users\sheng\1.JupyterNotes\titanic\train.csv")
train_data.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
Task01:查看缺失值。
train_data.isna().shape
(891, 12)
train_data.notna().shape
(891, 12)
train_data.isnull().head() # Boolean DataFrame of the same shape marking missing values; full output is large, so only the first 5 rows are shown.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | True | False |
1 | False | False | False | False | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False | True | False |
3 | False | False | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | True | False |
train_data.notnull().head() # Same check as above, inverted: True marks present values.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | True | True | True | True | True | True | True | True | True | True | False | True |
1 | True | True | True | True | True | True | True | True | True | True | True | True |
2 | True | True | True | True | True | True | True | True | True | True | False | True |
3 | True | True | True | True | True | True | True | True | True | True | True | True |
4 | True | True | True | True | True | True | True | True | True | True | False | True |
train_data.isnull().any() # True for columns containing at least one missing value — so Age, Cabin and Embarked have missing data.
PassengerId False
Survived False
Pclass False
Name False
Sex False
Age True
SibSp False
Parch False
Ticket False
Fare False
Cabin True
Embarked True
dtype: bool
train_data.isnull().all() # True only if a column is entirely missing — no such column in train_data.
PassengerId False
Survived False
Pclass False
Name False
Sex False
Age False
SibSp False
Parch False
Ticket False
Fare False
Cabin False
Embarked False
dtype: bool
train_data.isnull().sum()/len(train_data) # Fraction of missing values per column.
PassengerId 0.000000
Survived 0.000000
Pclass 0.000000
Name 0.000000
Sex 0.000000
Age 0.198653
SibSp 0.000000
Parch 0.000000
Ticket 0.000000
Fare 0.000000
Cabin 0.771044
Embarked 0.002245
dtype: float64
Task02 对缺失值进行处理
一般有两个函数,dropna()和fillna(),先研究下参数
.dropna:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html?highlight=dropna#pandas.DataFrame.dropna
axis:可选0和1两个值。0表示删除含有缺失的行;1表示删除含有缺失值的列。
how:可选any和all,默认为any。any表示如果某行或某列有缺失值,则删除该行或该列。all表示如果某行或某列缺失值比例为100%,则删除该行或该列。
thresh:必须为整数,可选参数。表示剩余非na值的数量大于等于thresh保留,其它删除。
subset:数组形式参数,可选。用于传入列名称的数组/列表,删除特定列中的空值行。
inplace:是否原地操作,默认False.
.fillna:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html?highlight=fillna#pandas.DataFrame.fillna
value:变量、字典、Series、DataFrame;用于填充缺失值,或为指定的DataFrame列的缺失值使用字典/Series/DataFrame的值填充。
method:有backfill, bfill, pad, ffill, None可选,默认为None。backfill/bfill表示用后一个非缺失值填充;pad/ffill表示用前一个非缺失值填充;None是用指定值填充。
axis:沿着哪个方向填充。0表示行,1表示列。
inplace:是否原地操作,默认False.
limit:限制填充数量。
train_data.shape
(891, 12)
train_data.dropna(subset=["Age"]).shape # Drop rows where Age is missing.
(714, 12)
train_data.shape # Original data is unchanged — dropna returns a copy.
(891, 12)
train_data.dropna(subset=["Cabin"]).shape # Drop rows where Cabin is missing.
(204, 12)
train_data.drop("Cabin",axis=1).shape # Drop the Cabin column entirely.
(891, 11)
train_data.dropna(thresh=11).shape # Keep rows with at least 11 non-NA values, i.e. at most one missing.
(733, 12)
train_data.dropna(thresh=12).shape # Keep only rows with no missing values at all.
(183, 12)
train_data["Age"].fillna(train_data["Age"].mean()).isnull().sum() # Fill missing Age with the column mean.
0
train_data.bfill().isnull().sum() # Backward fill (fillna(method="bfill") is deprecated in modern pandas); one Cabin NaN remains.
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 1
Embarked 0
dtype: int64
train_data.bfill().tail() # The last row's Cabin is NaN with no following value, so backward fill cannot fill it.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.00 | B42 | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 26.0 | 1 | 2 | W./C. 6607 | 23.45 | C148 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
train_data.ffill().isnull().sum() # Forward fill (fillna(method="ffill") is deprecated in modern pandas); one Cabin NaN remains.
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 1
Embarked 0
dtype: int64
train_data.ffill().head() # The first row's Cabin is NaN with no preceding value, so forward fill cannot fill it.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | C85 | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | C123 | S |
但简单的前值和后值填充都不太合适,希望Age按照均值填充,Embarked按照众数填充,Cabin直接删除。
# Fill Age with the mean and Embarked with the mode. Series.mode() returns a
# Series (there can be ties), so take its first element; the np.array(...)[0]
# detour the first attempt used is unnecessary.
dic = {"Age": train_data["Age"].mean(), "Embarked": train_data["Embarked"].mode().iloc[0]}
train_data.drop("Cabin",axis=1).fillna(dic).isnull().sum()
# First attempt failed to fill the two Embarked NaNs: the raw mode() result is a Series object, not a scalar string.
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
dtype: int64
# Comparing with np.nan via == is always False (NaN != NaN), so this finds nothing.
np.array([train_data["Embarked"]==np.nan]).sum()
0
# Element-wise comparison with None also fails to detect missing values.
np.array([train_data["Embarked"]==None]).sum()
0
# .isnull() is the reliable way — it finds the 2 missing Embarked values.
train_data["Embarked"].isnull().sum()
2
type(np.nan),type(None),type(pd.NaT),type(""),type(" ")
(float, NoneType, pandas._libs.tslibs.nattype.NaTType, str, str)
【思考】检索空缺值用np.nan,None以及.isnull()哪个更好,这是为什么?如果其中某个方式无法找到缺失值,原因又是为什么?
数据缺失有两个原因:(1)真的没有数据;(2)在数据搜集时候产生了错误。None是python关键字,属于单独的NoneType。NoneType只有一个值,就是None,在python中用于表示null值,用.isnull()是可以检测到的;如果参与算术运算会报错。可以理解为没有结果,即真的没有数据,而不是搜集时候产生的错误。np.nan是float类型的特殊浮点值,只能用np.nan本身表示;它与自身比较结果也是False,即这里有值,但值是多少不清楚,两个NaN比较大概率不一样;它可以参与运算,运算结果仍是NaN。.isnull()可以同时检测出上述两类;其它如"?"、""、" "(即问号、无内容、空格)均会被判断为有值。
np.nan == np.nan # NaN compares unequal even to itself.
False
# Build a small frame to compare missing-value markers.
# (Putting np.nan directly in the mixed literal row raised an error here, so assign afterwards.)
df = pd.DataFrame([[None,1,2],["?",""," "]])
# Use .loc for the assignments: chained indexing (df[1][0] = ...) may write
# through a copy and triggers SettingWithCopyWarning.
df.loc[0, 1] = np.nan
df.loc[0, 2] = np.nan  # note: the np.NaN alias was removed in NumPy 2.0 — use np.nan
df
0 | 1 | 2 | |
---|---|---|---|
0 | None | NaN | NaN |
1 | ? |
df.isnull() # "?", "" and " " all count as present (non-null) values.
0 | 1 | 2 | |
---|---|---|---|
0 | True | True | True |
1 | False | False | False |
df[0][0]/2 # Arithmetic on None raises TypeError — NoneType supports no operators.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-37-5ec1b6f840b7> in <module>()
----> 1 df[0][0]/2 #None计算,报错
TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'
df[1][0]/2 # NaN supports arithmetic and propagates: the result is NaN.
nan
Task03 重复值处理
train_data.duplicated().sum() # Count fully duplicated rows in train_data — none here.
0
train_data.nunique() # Number of distinct values per column.
PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3
dtype: int64
.duplicated() 参考链接:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html?highlight=duplicate
.drop_duplicates() 参考链接:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html#pandas.DataFrame.drop_duplicates
train_data.drop_duplicates(subset=["Ticket"],keep="first").shape # Example with Ticket: keep the first row per duplicate value, drop the rest.
(681, 12)
# Clean the data: drop sparse Cabin, fill Age with the mean, Embarked with the mode.
# Series.mode() returns a Series, so extract the first entry with .iloc[0].
dic = {"Age": train_data["Age"].mean(), "Embarked": train_data["Embarked"].mode().iloc[0]}
data = train_data.drop("Cabin",axis=1).fillna(dic)
data.to_csv("data.csv") # persist the cleaned data
Task 04 分箱操作
分箱操作即将连续性变量转化为离散型变量
d_cut_1 = data.copy()
d_cut_1["Age_Group"] = pd.cut(d_cut_1["Age"],bins=5,labels=[1,2,3,4,5]) # Split Age into 5 equal-width bins.
d_cut_1.to_csv("d_cut_1.csv") # save the binned data
d_cut_1.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
Age_Group 0
dtype: int64
d_cut_1.head(10)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C | 3 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 2 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | S | 3 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | S | 3 |
5 | 6 | 0 | 3 | Moran, Mr. James | male | 29.699118 | 0 | 0 | 330877 | 8.4583 | Q | 2 |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.000000 | 0 | 0 | 17463 | 51.8625 | S | 4 |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.000000 | 3 | 1 | 349909 | 21.0750 | S | 1 |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.000000 | 0 | 2 | 347742 | 11.1333 | S | 2 |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.000000 | 1 | 0 | 237736 | 30.0708 | C | 1 |
d_cut_2 = data.copy()
# Split Age into 5 groups at custom break points; right=False makes the intervals left-closed.
d_cut_2["Age_Group"] = pd.cut(d_cut_2["Age"],bins=[0,5,15,30,50,90],right=False,labels=[1,2,3,4,5])
d_cut_2.to_csv("d_cut_2.csv")
d_cut_2.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
Age_Group 0
dtype: int64
d_cut_2.head(10)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | S | 3 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C | 4 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 3 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | S | 4 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | S | 4 |
5 | 6 | 0 | 3 | Moran, Mr. James | male | 29.699118 | 0 | 0 | 330877 | 8.4583 | Q | 3 |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.000000 | 0 | 0 | 17463 | 51.8625 | S | 5 |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.000000 | 3 | 1 | 349909 | 21.0750 | S | 1 |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.000000 | 0 | 2 | 347742 | 11.1333 | S | 3 |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.000000 | 1 | 0 | 237736 | 30.0708 | C | 2 |
d_cut_3 = data.copy()
# Quantile-based binning of Age with qcut.
# NOTE(review): q stops at 0.9, so the top 10% of ages fall outside every bin
# and become NaN — append 1 to q (and a 6th label) if full coverage is intended.
d_cut_3["Age_Group"] = pd.qcut(d_cut_3["Age"],q=[0,.1,.3,.5,.7,.9],labels=[1,2,3,4,5])
d_cut_3.to_csv("d_cut_3.csv")
d_cut_3.isnull().sum() # 89 samples (~10%) are left unbinned.
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
Age_Group 89
dtype: int64
d_cut_3.head(10)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | S | 2.0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C | 5.0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 3.0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | S | 5.0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | S | 5.0 |
5 | 6 | 0 | 3 | Moran, Mr. James | male | 29.699118 | 0 | 0 | 330877 | 8.4583 | Q | 3.0 |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.000000 | 0 | 0 | 17463 | 51.8625 | S | NaN |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.000000 | 3 | 1 | 349909 | 21.0750 | S | 1.0 |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.000000 | 0 | 2 | 347742 | 11.1333 | S | 3.0 |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.000000 | 1 | 0 | 237736 | 30.0708 | C | 1.0 |
Task05 对文本变量进行转换
train_data.dtypes
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
train_data.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
train_data.nunique() # Cabin has 147 distinct values plus many NaNs, so it cannot be mapped directly to a handful of categories.
PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3
dtype: int64
train_data.head(10)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
train_data.sort_values(by="Cabin",ascending=True)[0:220:6]
# Sort and slice to eyeball Cabin values: the leading letter groups them into
# decks (A–F plus missing), so the plan is to extract the first character as a
# deck label and then encode it numerically.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
583 | 584 | 0 | 1 | Ross, Mr. John Hugo | male | 36.0 | 0 | 0 | 13049 | 40.1250 | A10 | C |
867 | 868 | 0 | 1 | Roebling, Mr. Washington Augustus II | male | 31.0 | 0 | 0 | PC 17590 | 50.4958 | A24 | S |
96 | 97 | 0 | 1 | Goldschmidt, Mr. George B | male | 71.0 | 0 | 0 | PC 17754 | 34.6542 | A5 | C |
329 | 330 | 1 | 1 | Hippach, Miss. Jean Gertrude | female | 16.0 | 0 | 1 | 111361 | 57.9792 | B18 | C |
61 | 62 | 1 | 1 | Icard, Miss. Amelie | female | 38.0 | 0 | 0 | 113572 | 80.0000 | B28 | NaN |
487 | 488 | 0 | 1 | Kent, Mr. Edward Austin | male | 58.0 | 0 | 0 | 11771 | 29.7000 | B37 | C |
484 | 485 | 1 | 1 | Bishop, Mr. Dickinson H | male | 25.0 | 1 | 0 | 11967 | 91.0792 | B49 | C |
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36.0 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
671 | 672 | 0 | 1 | Davidson, Mr. Thornton | male | 31.0 | 1 | 0 | F.C. 12750 | 52.0000 | B71 | S |
195 | 196 | 1 | 1 | Lurette, Miss. Elise | female | 58.0 | 0 | 0 | PC 17569 | 146.5208 | B80 | C |
802 | 803 | 1 | 1 | Carter, Master. William Thornton II | male | 11.0 | 1 | 2 | 113760 | 120.0000 | B96 B98 | S |
110 | 111 | 0 | 1 | Porter, Mr. Walter Chamberlain | male | 47.0 | 0 | 0 | 110465 | 52.0000 | C110 | S |
711 | 712 | 0 | 1 | Klaber, Mr. Herman | male | NaN | 0 | 0 | 113028 | 26.5500 | C124 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
341 | 342 | 1 | 1 | Fortune, Miss. Alice Elizabeth | female | 24.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
716 | 717 | 1 | 1 | Endres, Miss. Caroline Louise | female | 38.0 | 0 | 0 | PC 17757 | 227.5250 | C45 | C |
430 | 431 | 1 | 1 | Bjornstrom-Steffansson, Mr. Mauritz Hakan | male | 28.0 | 0 | 0 | 110564 | 26.5500 | C52 | S |
698 | 699 | 0 | 1 | Thayer, Mr. John Borland | male | 49.0 | 1 | 1 | 17421 | 110.8833 | C68 | C |
230 | 231 | 1 | 1 | Harris, Mrs. Henry Birkhardt (Irene Wallach) | female | 35.0 | 1 | 0 | 36973 | 83.4750 | C83 | S |
332 | 333 | 0 | 1 | Graham, Mr. George Edward | male | 38.0 | 0 | 1 | PC 17582 | 153.4625 | C91 | S |
269 | 270 | 1 | 1 | Bissette, Miss. Amelia | female | 35.0 | 0 | 0 | PC 17760 | 135.6333 | C99 | S |
218 | 219 | 1 | 1 | Bazzani, Miss. Albina | female | 32.0 | 0 | 0 | 11813 | 76.2917 | D15 | C |
457 | 458 | 1 | 1 | Kenyon, Mrs. Frederick R (Marion) | female | NaN | 1 | 0 | 17464 | 51.8625 | D21 | S |
52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C |
740 | 741 | 1 | 1 | Hawksford, Mr. Walter James | male | NaN | 0 | 0 | 16988 | 30.0000 | D45 | S |
21 | 22 | 1 | 2 | Beesley, Mr. Lawrence | male | 34.0 | 0 | 0 | 248698 | 13.0000 | D56 | S |
303 | 304 | 1 | 2 | Keane, Miss. Nora A | female | NaN | 0 | 0 | 226593 | 12.3500 | E101 | Q |
707 | 708 | 1 | 1 | Calderhead, Mr. Edward Pennington | male | 42.0 | 0 | 0 | PC 17476 | 26.2875 | E24 | S |
166 | 167 | 1 | 1 | Chibnall, Mrs. (Edith Martha Bowerman) | female | NaN | 0 | 1 | 113505 | 55.0000 | E33 | S |
434 | 435 | 0 | 1 | Silvey, Mr. William Baird | male | 50.0 | 1 | 0 | 13507 | 55.9000 | E44 | S |
262 | 263 | 0 | 1 | Taussig, Mr. Emil | male | 52.0 | 1 | 1 | 110413 | 79.6500 | E67 | S |
128 | 129 | 1 | 3 | Peter, Miss. Anna | female | NaN | 1 | 1 | 2668 | 22.3583 | F E69 | C |
148 | 149 | 0 | 2 | Navratil, Mr. Michel ("Louis M Hoffman") | male | 36.5 | 0 | 2 | 230080 | 26.0000 | F2 | S |
618 | 619 | 1 | 2 | Becker, Miss. Marion Louise | female | 4.0 | 2 | 1 | 230136 | 39.0000 | F4 | S |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | NaN | 0 | 0 | 244373 | 13.0000 | NaN | S |
train_data["Sex_num"]=train_data["Sex"].map({"male":1,"female":2}) # Encode Sex as a binary numeric variable via map.
train_data.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 2 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
train_data["Embarked"].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
train_data["Embarked_num"]=train_data["Embarked"].replace(["S","C","Q"],[1,2,3]) # Encode the embarkation port numerically with replace; the 2 NaNs remain, making the dtype float.
train_data.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_num | Embarked_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 | 1.0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 2 | 2.0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 | 1.0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 2 | 1.0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 | 1.0 |
train_data["Cabin_str"]=train_data["Cabin"].str[0] # Deck label = first character of Cabin (approximate: multi-cabin strings like "B51 B53 B55" keep only the first letter).
train_data["Cabin_str"].value_counts()
C 59
B 47
D 33
E 32
A 15
F 13
G 4
T 1
Name: Cabin_str, dtype: int64
train_data[train_data["Cabin_str"]=="T"] # Inspect the unexpected "T" deck value.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_num | Embarked_num | Cabin_num | Cabin_str | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
339 | 340 | 0 | 1 | Blackwell, Mr. Stephen Weart | male | 45.0 | 0 | 0 | 113784 | 35.5 | T | S | 1 | 1.0 | T | T |
train_data["Cabin_num"]=train_data["Cabin_str"].replace(["A","B","C","D","E","F","G","T"],[1,2,3,4,5,6,7,8])
# Map deck letters to integers with replace; rows with missing Cabin stay NaN.
train_data.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_num | Embarked_num | Cabin_num | Cabin_str | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 | 1.0 | NaN | NaN |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 2 | 2.0 | 3.0 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 | 1.0 | NaN | NaN |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 2 | 1.0 | 3.0 | C |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 | 1.0 | NaN | NaN |
pd.get_dummies(train_data["Age"]) # One-hot encode Age (concat the result onto the original frame to use it); 88 distinct ages yield 88 indicator columns.
0.42 | 0.67 | 0.75 | 0.83 | 0.92 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | ... | 62.0 | 63.0 | 64.0 | 65.0 | 66.0 | 70.0 | 70.5 | 71.0 | 74.0 | 80.0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
16 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
861 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
862 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
863 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
864 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
865 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
866 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
867 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
868 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
869 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
870 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
871 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
872 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
873 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
874 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
875 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
876 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
877 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
878 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
879 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
880 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
881 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
882 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
883 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
884 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
885 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
886 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
887 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
888 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
889 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
890 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
891 rows × 88 columns
# One-hot encode selected columns with get_dummies. dummy_na=True adds an
# indicator column for missing values, which Cabin_str needs.
for column in ("Age", "Cabin_str", "Sex"):
    dummies = pd.get_dummies(train_data[column], prefix=column, dummy_na=True)
    train_data = pd.concat([train_data, dummies], axis=1)
train_data.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare',
...
'Cabin_str_C', 'Cabin_str_D', 'Cabin_str_E', 'Cabin_str_F',
'Cabin_str_G', 'Cabin_str_T', 'Cabin_str_nan', 'Sex_female', 'Sex_male',
'Sex_nan'],
dtype='object', length=218)
get_dummies(): https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.get_dummies.html?highlight=get_dummies#pandas.Series.str.get_dummies
Task06 纯文本特征提取
我们用.str.extract()函数+正则表达式实现
train_data["Name"] # Inspect the data first to design the regex: the pattern is "Surname, [Title]. Given names".
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
5 Moran, Mr. James
6 McCarthy, Mr. Timothy J
7 Palsson, Master. Gosta Leonard
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9 Nasser, Mrs. Nicholas (Adele Achem)
10 Sandstrom, Miss. Marguerite Rut
11 Bonnell, Miss. Elizabeth
12 Saundercock, Mr. William Henry
13 Andersson, Mr. Anders Johan
14 Vestrom, Miss. Hulda Amanda Adolfina
15 Hewlett, Mrs. (Mary D Kingcome)
16 Rice, Master. Eugene
17 Williams, Mr. Charles Eugene
18 Vander Planke, Mrs. Julius (Emelia Maria Vande...
19 Masselmani, Mrs. Fatima
20 Fynney, Mr. Joseph J
21 Beesley, Mr. Lawrence
22 McGowan, Miss. Anna "Annie"
23 Sloper, Mr. William Thompson
24 Palsson, Miss. Torborg Danira
25 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
26 Emir, Mr. Farred Chehab
27 Fortune, Mr. Charles Alexander
28 O'Dwyer, Miss. Ellen "Nellie"
29 Todoroff, Mr. Lalio
...
861 Giles, Mr. Frederick Edward
862 Swift, Mrs. Frederick Joel (Margaret Welles Ba...
863 Sage, Miss. Dorothy Edith "Dolly"
864 Gill, Mr. John William
865 Bystrom, Mrs. (Karolina)
866 Duran y More, Miss. Asuncion
867 Roebling, Mr. Washington Augustus II
868 van Melkebeke, Mr. Philemon
869 Johnson, Master. Harold Theodor
870 Balkic, Mr. Cerin
871 Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
872 Carlsson, Mr. Frans Olof
873 Vander Cruyssen, Mr. Victor
874 Abelson, Mrs. Samuel (Hannah Wizosky)
875 Najib, Miss. Adele Kiamie "Jane"
876 Gustafsson, Mr. Alfred Ossian
877 Petroff, Mr. Nedelio
878 Laleff, Mr. Kristo
879 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
880 Shelley, Mrs. William (Imanita Parrish Hall)
881 Markun, Mr. Johann
882 Dahlberg, Miss. Gerda Ulrika
883 Banfield, Mr. Frederick James
884 Sutehall, Mr. Henry Jr
885 Rice, Mrs. William (Margaret Norton)
886 Montvila, Rev. Juozas
887 Graham, Miss. Margaret Edith
888 Johnston, Miss. Catherine Helen "Carrie"
889 Behr, Mr. Karl Howell
890 Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object
# Extract the honorific title: the run of letters immediately before a "." (e.g. "Mr", "Mrs").
# Use a raw string so "\." is a proper regex escape rather than an invalid string escape.
train_data["Title"] = train_data["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)
正则表达式可参考:https://c.runoob.com/front-end/854/
train_data["Title"].value_counts() # Distribution of the extracted titles.
Mr 517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6
Mlle 2
Major 2
Col 2
Don 1
Ms 1
Capt 1
Sir 1
Lady 1
Mme 1
Jonkheer 1
Countess 1
Name: Title, dtype: int64
train_data.to_csv("my_data.csv") # Persist the feature-engineered data.