# Posting the Titanic code I wrote a long time ago.
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 30 14:23:12 2017
@author: Yichengfan
"""
import pandas as pd
# Load the Titanic training and test CSV files from the local data folder.
train = pd.read_csv(r"F:\TS\03_other_parts\Titanic\02_data\train.csv")
test = pd.read_csv(r"F:\TS\03_other_parts\Titanic\02_data\test.csv")
# Start by printing a summary of both the training and the test data. This is
# a good habit: it gives an overall picture of the size of the data, the dtype
# of every feature, and which columns have missing values.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each summary.
train.info()
test.info()
# Recorded output of the two calls above, kept as a bare string literal (it
# has no runtime effect).
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
'''
selectd_features = ['Pclass','Sex', 'Age', 'Embarked',