代码
加载数据
import os
import pandas as pd
# Directory that holds the Kaggle Titanic CSV files.
TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one Titanic CSV file from *titanic_path* into a pandas DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, filename))
# Load the training and test sets from datasets/titanic/.
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")
# Training labels: the "Survived" column of the training set.
y_train = train_data["Survived"]
查看数据结构
# Peek at the first rows to see which columns are available.
train_data.head()
对幸存率比较有用的是:
数字属性:age\sibsp\parch\fare
文本属性:pclass\sex\embarked\cabin
其他属性可以忽略
# Dtypes and non-null counts per column — shows which attributes have missing values.
train_data.info()
分析发现
- age属性有缺失情况,缺失占比为19%,拟采用中位数填补
- embarked有缺失,缺失只有一点点,但它是文本属性,改用出现次数最多的值(众数)填补
- cabin缺失情况太严重,就不采用这个属性进行训练
查看文本属性值分布情况
预处理数据
处理数值属性
中位数替代
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a fixed subset of DataFrame columns."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        # Select the configured columns from the incoming DataFrame.
        return X[self.attribute_names]
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Numerical branch: pick the numeric columns, then fill missing values
# (Age is ~19% missing, per the analysis above) with each column's median.
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])
#num_pipeline.fit_transform(train_data)
处理文本属性
独热编码
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Impute each column's missing values with that column's most frequent value."""

    def fit(self, X, y=None):
        # For every column of the DataFrame, value_counts() sorts values by
        # frequency, so index[0] is the mode.  Store one mode per column,
        # keyed by column name, so fillna can align on columns later.
        self.most_frequent_ = pd.Series(
            [X[col].value_counts().index[0] for col in X],
            index=X.columns,
        )
        return self

    def transform(self, X, y=None):
        # Replace NaNs column-by-column with the modes learned in fit().
        return X.fillna(self.most_frequent_)
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: select the text/ordinal columns, fill missing values
# with each column's most frequent value, then one-hot encode to a dense array.
# FIX: the `sparse` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4; `sparse_output=False` is the current spelling (requires sklearn >= 1.2).
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse_output=False)),
])
#cat_pipeline.fit_transform(train_data)
将前面的数字和文本联合起来一起训练
from sklearn.pipeline import FeatureUnion
# Run the numerical and categorical branches and concatenate their outputs
# column-wise into a single feature matrix.
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
# Fit the preprocessors on the training data and build the training matrix.
X_train = preprocess_pipeline.fit_transform(train_data)
# Notebook-style expression statement: displays the resulting array.
X_train
开始训练
使用SVM进行训练
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# Support-vector classifier baseline with an explicit gamma setting.
svm_clf = SVC(gamma="auto")
# Estimate accuracy with 10-fold cross-validation on the training set.
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
def display_scores(scores):
    """Print cross-validation scores together with their mean and spread."""
    mean, std = scores.mean(), scores.std()
    print("Scores:", scores)
    print("Mean:", mean)
    print("Standard deviation:", std)
# Report the per-fold SVM scores plus their mean and standard deviation.
display_scores(svm_scores)
结果见上方 display_scores 的输出
使用随机森林进行训练
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline; fixed random_state makes the run reproducible.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Same 10-fold cross-validation protocol as the SVM, for a fair comparison.
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
display_scores(forest_scores)
测试集性能测试略过