机器学习,数据分析
DecisionTreeRegressor
sklearn.tree.DecisionTreeRegressor
决策树回归器
参数:
max_leaf_nodes 最大叶子节点数
train_test_split
sklearn.model_selection.train_test_split
划分数据集
# Hold out 20% of the rows for validation; random_state fixes the shuffle so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X为输入特征,y为输出
mean_absolute_error
sklearn.metrics.mean_absolute_error
获取平均绝对误差
# sklearn's documented signature is mean_absolute_error(y_true, y_pred).
# MAE itself is symmetric, so the value is unchanged, but keeping the
# documented order avoids surprises if multioutput/sample_weight options
# are added later.
val_mae = mean_absolute_error(val_y, val_predictions)
RandomForestRegressor
随机森林回归器
from sklearn.ensemble import RandomForestRegressor
# Fit a random-forest regressor (seed fixed for reproducibility) and predict on the validation features.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
SimpleImputer
sklearn.impute.SimpleImputer
缺失值处理
# Default SimpleImputer fills each missing entry with its column mean.
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# fit_transform: learn the fill statistics from the data, then apply them.
# transform: reuse the statistics already learned during fit — the validation
# set must be filled with training-set statistics, never its own.
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
添加列,表示是否有缺失值:
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/088551ceaa4b895c205d5e53dcd4fb89.png)
# Columns in the training set that contain at least one missing value.
miss_col = [col for col in X_train.columns if X_train[col].isnull().any()]
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
print(miss_col)
# Add a boolean "<col>_missing" indicator for every incomplete column, so the
# model can still see which entries were originally absent after imputation.
# (The loop body lost its indentation in the pasted note; restored here.)
for col in miss_col:
    X_train_plus[col + '_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_missing'] = X_valid_plus[col].isnull()
# Impute (column mean by default), then restore the column labels that are
# dropped because the imputer returns a plain ndarray.
my_imputer = SimpleImputer()
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid_plus))
final_X_train.columns = X_train_plus.columns
final_X_valid.columns = X_valid_plus.columns
LabelEncoder
sklearn.preprocessing.LabelEncoder
把离散标签转换为0~n-1的数字
# Low-cardinality categoricals: object-dtype columns with fewer than 10 distinct values.
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].nunique() < 10
                        and X_train_full[cname].dtype == "object"]
label_encoder = LabelEncoder()
# NOTE(review): LabelEncoder is documented for the target y; sklearn recommends
# OrdinalEncoder for feature columns. transform() below also raises on any
# category present in X_valid but absent from X_train, so this only works when
# the validation categories are a subset of the training ones — confirm.
# (The loop body lost its indentation in the pasted note; restored here.)
for col in object_cols:  # object_cols: the categorical columns being encoded
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
OneHotEncoder
sklearn.preprocessing.OneHotEncoder
Onehot 标签
# Apply one-hot encoder to each column with categorical data
# Apply a one-hot encoder to each categorical column.
# handle_unknown='ignore' encodes categories unseen at fit time as all-zeros
# instead of raising. sparse_output=False returns a dense array — the old
# `sparse` alias was deprecated in scikit-learn 1.2 and removed in 1.4.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
# One-hot encoding removed the index; put it back so concat aligns rows.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove the original categorical columns (replaced by their one-hot encoding).
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Append the one-hot columns to the remaining numerical features.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
Pipeline
# Preprocessing pipeline for categorical data: fill gaps with the most
# frequent value, then one-hot encode (unseen categories become all zeros).
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
cross_val_score
sklearn.model_selection.cross_val_score
交叉验证
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),('model',RandomForestRegressor(n_estimators=50,random_state=0))])
# scores holds one value per fold (5 here). The scorer is *negated* MAE
# (sklearn scorers follow "greater is better"), so multiply by -1 to get
# positive error values.
scores = -1 * cross_val_score(my_pipeline, X, y,cv=5,scoring='neg_mean_absolute_error')
xgboost.XGBRegressor
梯度提升
# n_estimators is an upper bound: with early stopping, boosting halts once the
# validation score fails to improve for 5 consecutive rounds.
# Since XGBoost 1.6 early_stopping_rounds belongs on the estimator constructor;
# passing it to fit() is deprecated and removed in 2.x.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)
删除含缺失值的列
# Drop every column containing at least one NaN — the simplest missing-value
# strategy. Note: dropna returns a new DataFrame; assign the result to keep it.
X_train.dropna(axis=1)
axis=0 时删除含缺失值的行