"""Titanic survival prediction with a random forest.

Reads ``train.csv`` and ``test.csv`` from the working directory, engineers a
handful of features, trains a ``RandomForestClassifier`` on a 70/30 split of
the training data, prints hold-out metrics, and predicts survival for the
rows of ``test.csv``.

Every preprocessing statistic (imputation values, bin edges, label
encodings) is learned from the training data only and then applied unchanged
to the test data, so a given feature value always maps to the same code.
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# The grid below has 750 parameter combinations x 5 folds, so the search is
# slow; it is off by default and the previously chosen parameters are used.
RUN_GRID_SEARCH = False


def _extract_title(name):
    """Return the honorific from a 'Surname, Title. Given names' string."""
    return name.split(', ')[1].split('. ')[0]


def _apply_train_bins(values, train_binned, edges):
    """Bin *values* using the bin edges learned on the training data.

    The outer edges are opened to +/- infinity so values outside the training
    range fall into the first/last bin instead of becoming missing. The
    result reuses the interval categories of *train_binned*.
    """
    open_edges = edges.copy()
    open_edges[0] = float('-inf')
    open_edges[-1] = float('inf')
    codes = pd.cut(values, bins=open_edges, labels=False)
    # A still-missing value has no bin; -1 is the categorical code for NaN.
    codes = codes.fillna(-1).astype(int).to_numpy()
    return pd.Categorical.from_codes(
        codes, categories=train_binned.cat.categories, ordered=True)


def _encode_with_train_labels(train_col, val_col):
    """Label-encode both columns with an encoder fitted on *train_col* only.

    Returns ``(train_codes, val_codes)``. A value of *val_col* that never
    occurs in *train_col* is encoded as -1.
    """
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(train_col)
    val_codes = pd.Categorical(val_col, categories=encoder.classes_).codes
    return train_codes, val_codes


data_raw = pd.read_csv('train.csv')
data_val = pd.read_csv('test.csv')

# Plot how many passengers survived vs. did not.
sns.countplot(x='Survived', data=data_raw)
plt.show()

# Data cleaning: fill missing Age/Fare with the training median and missing
# Embarked with the most frequent training value (mode() returns a Series).
age_median = data_raw['Age'].median()
fare_median = data_raw['Fare'].median()
embarked_mode = data_raw['Embarked'].mode()[0]
for frame in (data_raw, data_val):
    # Assign back instead of fillna(inplace=True) on a column selection,
    # which is not guaranteed to modify the parent frame.
    frame['Age'] = frame['Age'].fillna(age_median)
    frame['Fare'] = frame['Fare'].fillna(fare_median)
    frame['Embarked'] = frame['Embarked'].fillna(embarked_mode)

# Drop columns that are not used as features.
data_raw = data_raw.drop(['PassengerId', 'Cabin'], axis=1)
data_val = data_val.drop(['PassengerId', 'Cabin'], axis=1)
data_raw.info()

# New feature: family size (siblings/spouses + parents/children + self).
data_raw['family_size'] = data_raw['SibSp'] + data_raw['Parch'] + 1
data_val['family_size'] = data_val['SibSp'] + data_val['Parch'] + 1
data_raw.info()

# New feature: the title embedded in the passenger name.
data_raw['title'] = data_raw['Name'].apply(_extract_title)
data_val['title'] = data_val['Name'].apply(_extract_title)
print(data_raw['title'].value_counts())

# Discretize fare into 4 equal-frequency bins (qcut); edges come from train.
data_raw['fare_bin'], fare_edges = pd.qcut(data_raw['Fare'], 4, retbins=True)
data_val['fare_bin'] = _apply_train_bins(
    data_val['Fare'], data_raw['fare_bin'], fare_edges)
print(data_raw['fare_bin'].value_counts())

# Discretize age into 5 equal-width bins (cut); edges come from train.
data_raw['age_bin'], age_edges = pd.cut(data_raw['Age'], 5, retbins=True)
data_val['age_bin'] = _apply_train_bins(
    data_val['Age'], data_raw['age_bin'], age_edges)
print(data_raw['age_bin'].value_counts())

# Integer-encode the categorical features with train-fitted encoders.
data_raw['sex_code'], data_val['sex_code'] = _encode_with_train_labels(
    data_raw['Sex'], data_val['Sex'])
print(data_raw['sex_code'])
data_raw['embarked_code'], data_val['embarked_code'] = (
    _encode_with_train_labels(data_raw['Embarked'], data_val['Embarked']))
data_raw['title_code'], data_val['title_code'] = _encode_with_train_labels(
    data_raw['title'], data_val['title'])

# The bin columns are ordered categoricals sharing the training categories,
# so their category codes are already consistent integer labels.
data_raw['fare_bin_code'] = data_raw['fare_bin'].cat.codes
data_val['fare_bin_code'] = data_val['fare_bin'].cat.codes
data_raw['age_bin_code'] = data_raw['age_bin'].cat.codes
data_val['age_bin_code'] = data_val['age_bin'].cat.codes

pd.options.display.max_columns = 20
print(data_raw.head())
print(data_raw.columns)

# Feature selection.
# Target column name(s).
Target = ['Survived']
# Feature column names.
data_columns = ['Pclass', 'sex_code', 'age_bin_code', 'fare_bin_code',
                'embarked_code', 'family_size', 'title_code']

# Split the labelled data into a training part and a hold-out part. The
# target is flattened to 1-D because scikit-learn estimators expect a 1-D y.
X_train, X_test, y_train, y_test = train_test_split(
    data_raw[data_columns], data_raw[Target].values.ravel(),
    test_size=0.3, random_state=5)

# max_features: number of features considered per split ('sqrt' is what the
# removed 'auto' option meant for classifiers); n_jobs=-1 uses all cores.
# random_state fixes the seed so the fitted forest is reproducible; without
# it every run builds a different model.
rfc = RandomForestClassifier(max_features='sqrt', random_state=1, n_jobs=-1)
param_grid = {
    'n_estimators': [50, 100, 400, 700, 1000],   # number of trees
    'criterion': ['gini', 'entropy'],            # split quality measure
    'max_depth': [5, 8, 15, 25, 30],             # maximum tree depth
    'min_samples_split': [2, 4, 10, 12, 16],     # min samples to split a node
    'min_samples_leaf': [1, 5, 10],              # min samples in a leaf
}
# Grid search object: 5-fold cross-validation scored by accuracy.
gs = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5,
                  scoring='accuracy', n_jobs=-1)
if RUN_GRID_SEARCH:
    gs.fit(X_train, y_train)
    # Best mean cross-validated accuracy and the parameters that achieved it.
    print(gs.best_score_)
    print(gs.best_params_)

# Build the final forest with the chosen hyper-parameters.
rfc = RandomForestClassifier(max_features='sqrt', random_state=1, n_jobs=-1,
                             criterion='entropy', max_depth=18,
                             min_samples_leaf=5, min_samples_split=2,
                             n_estimators=700)
rfc.fit(X_train, y_train)
score = rfc.score(X_test, y_test)
print(score)

y_predict = rfc.predict(X_test)
# Precision, recall and F1 score on the hold-out part.
report = classification_report(y_test, y_predict)
print(report)
# ROC AUC needs a ranking score, so use the predicted probability of the
# positive class rather than the hard 0/1 predictions.
roc = roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])
print(roc)

# Predict the rows of test.csv using the same feature columns as training.
y_predict = rfc.predict(data_val[data_columns])
print(y_predict)
# Re-read the file so the result keeps the columns dropped during
# preprocessing, then append the predictions as a new column.
data_val = pd.read_csv('test.csv')
data_val['Survived'] = y_predict
# To export the result: data_val.to_excel('test1.xlsx')
04-01
429
12-21