import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Read the training CSV, then take an independent (deep) copy to work on so
# that the frame exactly as loaded stays available in inputData.
inputData = pd.read_csv('./train.csv')
trainData = inputData.copy()  # DataFrame.copy() is deep by default
拷贝一份数据到 trainData 用于数据处理,不改动原始数据 inputData。(其实好像没啥用)
# Remove the PassengerId, Cabin and Ticket columns from the working copy,
# modifying trainData in place.
trainData.drop(columns=['PassengerId', 'Cabin', 'Ticket'], inplace=True)
删除乘客id(PassengerId)、机舱(Cabin)和船票号(Ticket)三列。
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the count row reveals which columns have missing values.
trainData.describe()
Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Most frequent Age value(s). Series.mode() has no axis argument; its only
# parameter is dropna (default True), so no argument is needed here.
trainData['Age'].mode()
0 24.0
dtype: float64
pandas 中的 mode 可以用来求众数,试一试。
对 DataFrame 调用且不指定具体列时,会分别求出每一列的众数。
axis = 1 是统计每一行的众数;
axis = 0(默认)是统计每一列的众数。注意:Series.mode 没有 axis 参数,唯一的参数是 dropna。
# Reduce every full name to its honorific: the pattern captures the text
# between ", " and the first following "." (e.g. "Mr", "Mrs", "Miss").
trainData['Name'] = trainData['Name'].str.extract(pat=r", (.*?)\.", expand=False)
print(trainData["Name"])
# The column now holds titles rather than names, so rename it to match.
trainData.rename(columns={'Name': 'Title'}, inplace=True)
0 Mr
1 Mrs
2 Miss
3 Mrs
4 Mr
...
886 Rev
887 Miss
888 Miss
889 Mr
890 Mr
Name: Name, Length: 891, dtype: object
# Merge the raw titles into five groups: Mr, Ms, Major, Jonkheer and Rev.
# Only titles that actually change are listed; the group names themselves
# (and any title not mentioned) are left as they are.
trainData["Title"] = trainData["Title"].replace({
    'Don': 'Mr',
    'Mrs': 'Ms',
    'Miss': 'Ms',
    'Mme': 'Ms',
    'Lady': 'Ms',
    'Dona': 'Ms',
    'Mlle': 'Ms',
    'Sir': 'Major',
    'Col': 'Major',
    'Capt': 'Major',
    'Master': 'Jonkheer',
    'the Countess': 'Jonkheer',
    'Dr': 'Rev',
})
把title分类
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# One-hot encode Sex. LabelEncoder returns a 1-D integer array, so it is
# reshaped into a single-column 2-D array, which OneHotEncoder requires.
# The encoder's default sparse result is densified with .toarray(); this works
# on every scikit-learn release, whereas the `sparse=False` keyword was
# deprecated in 1.2 and removed in 1.4.
a = LabelEncoder().fit_transform(trainData['Sex'])
sexOneHot = OneHotEncoder().fit_transform(a.reshape(-1, 1)).toarray()

# Distinct title groups as a column vector, printed for inspection.
listUniq = trainData['Title'].unique().reshape(-1, 1)
print(listUniq)

# One-hot encode Title. The encoder is fitted on the same single-column
# DataFrame it later transforms, so fit and transform see the same input type
# (fitting on a bare ndarray and transforming a DataFrame triggers a
# feature-name warning in recent scikit-learn). The learned categories are the
# same either way: the sorted distinct titles.
enc = OneHotEncoder()
enc.fit(trainData[['Title']])
titleOneHot = enc.transform(trainData[['Title']]).toarray()

# Both encodings are kept (sexOneHot, titleOneHot) so they can be joined to
# the feature matrix later; the Title encoding is displayed as the cell result.
titleOneHot
[['Mr']
['Ms']
['Jonkheer']
['Rev']
['Major']]
array([[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 1., 0.],
...,
[0., 0., 0., 1., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0.]])
对性别(Sex)和称谓(Title)进行 one-hot 编码。(成功是成功了,但发现不会用……无语子,不知道怎么把结果放进模型。)
# Most frequent Age value(s) among passengers whose Title is 'Mr'.
# .loc selects the rows and the column in a single step, and Series.mode()
# needs no argument: it has no axis parameter, only dropna (default True).
trainData.loc[trainData['Title'] == 'Mr', 'Age'].mode()
0 19.0
1 25.0
dtype: float64
# Most frequent Age value(s) among passengers whose Title is 'Ms'.
# .loc selects the rows and the column in a single step, and Series.mode()
# needs no argument: it has no axis parameter, only dropna (default True).
trainData.loc[trainData['Title'] == 'Ms', 'Age'].mode()
0 24.0
dtype: float64
trainData[trainData['Title'] == 'Major