%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
导入数据
# Load the training data from the Excel workbook.
df_t = pd.read_excel(r'D:\EdgeDownloadPlace\3dd40612152202ee8440f82a3d277008\train.xlsx')
# Drop the 'uid' column.
df_t = df_t.drop(columns='uid')
# Replace every '?' placeholder with the most frequent real value (the mode)
# of its own column.
for col in df_t.columns:
    is_placeholder = df_t[col] == '?'
    if not is_placeholder.any():
        # Nothing to impute in this column.
        continue
    # Count only the real values so that '?' itself can never be picked as the
    # mode; value_counts() sorts by descending frequency, so index[0] is the
    # most frequent value.
    real_counts = df_t.loc[~is_placeholder, col].value_counts()
    if real_counts.empty:
        raise ValueError(
            f"column {col!r} has no value other than '?' to impute from"
        )
    # Assign through a single .loc call: the chained form
    # df_t[col][mask] = ... may write to a temporary copy and does nothing
    # under pandas Copy-on-Write.
    df_t.loc[is_placeholder, col] = real_counts.index[0]
# Convert the pandas.DataFrame to a numpy.ndarray with element type np.float32.
arr_t = df_t.values.astype(np.float32)
交叉验证找分数最高的n_estimators
score_tt = []
for i in range(0,300,15):
rfc = RandomForestClassifier(n_estimators = i+1