原文:
http://ihoge.cn/2018/sklearn-ensemble.html
随机森林分类预测泰坦尼克号幸存者
import pandas as pd
import numpy as np
def _encode_by_first_appearance(column):
    """Return integer codes for *column*, numbered by order of first appearance.

    Every missing value (whatever ``pd.isna`` reports as missing) shares one
    code, assigned at the position where the first missing value shows up.
    """
    codes = {}
    encoded = []
    for value in column:
        # Collapse all NaN/None cells onto a single key: NaN != NaN, so
        # without this each missing cell could look like a brand-new label.
        key = None if pd.isna(value) else value
        # len(codes) is evaluated before insertion, so a new label gets the
        # next unused integer and a known label keeps its existing code.
        encoded.append(codes.setdefault(key, len(codes)))
    return encoded


def read_dataset(fname):
    """Load a Titanic-style CSV and return a cleaned DataFrame.

    Steps performed:
      * the first CSV column becomes the index;
      * the ``Name``, ``Ticket`` and ``Cabin`` columns are dropped;
      * ``Sex`` and ``Embarked`` are replaced by integer codes numbered in
        order of first appearance in the file (a missing value gets its own
        code rather than being filled with 0);
      * every remaining missing value is filled with 0.

    Note that the codes depend on row order, so two files whose labels first
    appear in a different order are encoded differently.

    Args:
        fname: Path or file-like object, passed straight to ``pd.read_csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If one of the columns named above is absent from the file.
    """
    data = pd.read_csv(fname, index_col=0)
    data = data.drop(columns=['Name', 'Ticket', 'Cabin'])
    for column in ('Sex', 'Embarked'):
        data[column] = _encode_by_first_appearance(data[column])
    return data.fillna(0)
# Load the Titanic training CSV through read_dataset.
train = read_dataset('code/datasets/titanic/train.csv')
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the 'Survived' target column.
features = train.drop(columns=['Survived'])
X = features.values
y = train['Survived'].values

# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# rows that land in each split differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Report the shapes of the resulting arrays.
print("X_train_shape:", X_train.shape, " y_train_shape:", y_train.shape)
print("X_test_shape:", X_test.shape, " y_test_shape:", y_test.shape)
X_train_shape: (712, 7) y_train_sh