import pandas as pd;
from sklearn.tree import DecisionTreeClassifier; # 导入决策树
from sklearn.linear_model import LogisticRegression; # 导入逻辑回归
from sklearn.model_selection import cross_val_score;
from sklearn.impute import SimpleImputer;
from sklearn.preprocessing import OneHotEncoder;
import matplotlib.pyplot as plt
# Load the Titanic training set and take a first look at it.
titanic_data = pd.read_csv(r"D:/titanic/train.csv")

# Preview the first rows. Wrapped in print() so the output is visible
# when this runs as a plain script (in a notebook the bare expression
# would display, but a discarded head() does nothing in a .py file).
print(titanic_data.head())

# Show column dtypes and non-null counts to spot missing values.
titanic_data.info()
可以看出,有部分列有缺失值,这个后面要做相应处理。
1、数据预处理
1.1 重复值处理
# Count fully duplicated rows; print() so the result is visible when
# run as a script (a bare expression's value is silently discarded).
print(titanic_data.duplicated().sum())
可以看出,数据比较好,没有重复值
1.2 从经验看来(这步其实在实际中是跟业务强相关),Cabin、Name、Ticket、PassengerId 这些特征,与最终是否获救是没有直接关系的,因此可以把这四列删除
# Drop columns judged not directly related to survival.
# NOTE(review): this is a domain/EDA call — in a real project it should be
# validated against the business, not intuition alone (Cabin is also
# mostly missing, per the earlier info() output).
titanic_data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)
1.3 填补缺失值
# Fill missing Age values with the column mean
# (SimpleImputer's default strategy is 'mean').
imp_mean = SimpleImputer()
titanic_data.loc[:, 'Age'] = imp_mean.fit_transform(
    # reshape(-1, 1): SimpleImputer expects a 2-D array, not a 1-D Series.
    titanic_data.loc[:, 'Age'].values.reshape(-1, 1)
)
# Fill missing Embarked values with the most frequent port.
# Inspect the value distribution first; print() so it shows in a script.
print(titanic_data.loc[:, 'Embarked'].value_counts())
imp_mostFre = SimpleImputer(strategy='most_frequent')
titanic_data.loc[:, 'Embarked'] = imp_mostFre.fit_transform(
    # reshape(-1, 1): SimpleImputer expects a 2-D array, not a 1-D Series.
    titanic_data.loc[:, 'Embarked'].values.reshape(-1, 1)
)
1.4 对Sex和Embarked进行OneHotEncoder
# One-hot encode the two categorical features, Sex and Embarked.
wait_to_encode = titanic_data.loc[:, ['Sex', 'Embarked']]
one_hot_encoder = OneHotEncoder(categories='auto')
# fit_transform in one call replaces the separate fit()/transform() pair.
result = one_hot_encoder.fit_transform(wait_to_encode).toarray()
# Column order must follow OneHotEncoder's sorted categories:
# Sex -> [female, male], Embarked -> [C, Q, S]. If categories ever
# change, regenerate names from one_hot_encoder.get_feature_names_out().
new_columns = ['Female', 'Male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
# Concat by index is safe here: no rows were dropped, so the default
# RangeIndex of the new frame aligns with titanic_data's index.
titanic_data = pd.concat(
    [titanic_data, pd.DataFrame(result, columns=new_columns)], axis=1
)
# The raw categorical columns are now redundant.
titanic_data.drop(['Sex', 'Embarked'], axis=1, inplace=True)
print(titanic_data.head())
1.5 分特征数据集和标签数据集
# Split into feature matrix and target; the boolean-mask .loc selection
# keeps BOTH as DataFrames (the target is a 1-column frame, not a Series),
# which downstream sklearn estimators accept.
titanic_feature = titanic_data.loc[:, titanic_data.columns != 'Survived']
titanic_target = titanic_data.loc[:, titanic_data.columns == 'Survived']
2、特征预处理
2.1 方差过滤
from sklearn.feature_selection import VarianceThreshold
varice_Selector = VarianceT