比赛相关部分练习总结
# Load the bike-sharing competition training set from a hard-coded local path.
df_train = pd.read_csv(
    'C:/Users/zhangy/Desktop/kaggle_competition_feature_engineering/kaggle_bike_competition_train.csv'
)
# Optional sanity checks:
# print(df_train.shape)
# print(df_train.apply(lambda x: sum(x.isnull())))  # number of missing values per column

# Parse the timestamp column once and derive the calendar features from it.
_timestamps = pd.DatetimeIndex(df_train.datetime)
df_train['month'] = _timestamps.month
df_train['day'] = _timestamps.dayofweek  # day of the week (Monday=0), not day of the month
df_train['hour'] = _timestamps.hour

# Keep a handle on the frame that still carries 'datetime'; the drop below
# returns a new DataFrame, so this reference is not affected by it.
df_train_origin = df_train
df_train = df_train.drop('datetime', axis=1)

# Separate the features from the regression target.
# NOTE(review): every remaining column other than 'count' is used as a feature;
# if the CSV contains 'casual'/'registered' columns they would leak the
# target -- confirm against the actual file.
df_train_data = df_train.drop('count', axis=1)  # training features
df_train_target = df_train['count']  # training labels

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_data,
    df_train_target,
    test_size=0.2,
    random_state=0,
)

# Baseline model (disabled): fit a random forest and report the R^2 score
# on the training and held-out splits.
# clf = RandomForestRegressor(n_estimators=100)
# clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))
# print(clf.score(X_test, y_test))
RandomForest:
sklearn.ensemble.RandomForestRegressor( n_estimators=10,
criterion='mse',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='auto',
max_leaf_nodes=None,
min_impurity_split=1e-07,
bootstrap=True,
oob_score=False,
n_jobs=1,
random_state=None,
verbose=0,
warm_start=False)
其中关于决策树的参数:
criterion: 衡量划分质量的指标;回归树使用“mse”(均方误差)来选择最优的划分。
splitter: “best” or “random”(default=“best”),在每个节点选择最优划分,还是在随机生成的候选划分中选择最优的一个,建议用默认。注意该参数属于单棵决策树(DecisionTreeRegressor),上面的RandomForestRegressor参数列表中没有它。
max_features: 寻找最优划分时最多考虑的特征数。
当为整数时,即每次划分考虑的最大特征数;当为小数时,考虑 int(max_features * n_features) 个特征;
If “auto”, then max_features=n_features(回归器 RandomForestRegressor;对分类器 RandomForestClassifier,“auto” 才是 sqrt(n_features))。
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features.
max_depth: (default=None)设置树的最大深度。默认为None时,节点会一直划分,直到所有叶节点都是纯的,或叶节点包含的样本数少于min_samples_split。
min_samples_split: 划分一个内部节点所需的最少样本数(default=2);样本数少于该值的节点不再继续划分。
min_samples_leaf: 叶子节点最少的样本数。
max_leaf_nodes: (default=None)叶子节点的最大数量;设置后以最优优先(best-first)的方式生长树,None表示不限制叶子节点的数量。
min_weight_fraction_leaf: