【Python】sklearn笔记

DecisionTreeRegressor

sklearn.tree.DecisionTreeRegressor
决策树回归
参数:
max_leaf_nodes 最大叶子节点数

train_test_split

sklearn.model_selection.train_test_split
划分数据集
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X为输入特征,y为输出

mean_absolute_error

sklearn.metrics.mean_absolute_error
获取平均绝对误差
val_mae = mean_absolute_error(val_predictions, val_y)

RandomForestRegressor

随机森林回归
from sklearn.ensemble import RandomForestRegressor

# Random-forest regression: fit on the training split, then score the
# validation split. `fit` returns the estimator itself, so fitting can be
# chained onto construction.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)

SimpleImputer

sklearn.impute.SimpleImputer
缺失值处理

# Fill missing values with SimpleImputer (default strategy: per-column mean).
my_imputer = SimpleImputer()
# fit_transform learns the fill statistics on the training split and applies
# them; transform reuses those learned statistics on the validation split
# (never re-fit on validation, or the two splits are filled inconsistently).
# Passing columns= AND index= restores both labels; the imputer returns a bare
# ndarray, and the original code restored columns only, silently resetting the
# row index and breaking alignment with y_train / y_valid.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns,
                               index=X_valid.index)

添加列,表示是否有缺失值

# Extension of plain imputation: for every feature that has missing values,
# add a boolean "<col>_missing" indicator column, then impute. The model can
# then learn from the *fact* that a value was missing, not just the fill value.
miss_col = [col for col in X_train.columns if X_train[col].isnull().any()]
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
print(miss_col)
for col in miss_col:
    # The indicator is computed per split from that split's own missingness.
    X_train_plus[col + '_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_missing'] = X_valid_plus[col].isnull()

my_imputer = SimpleImputer()
# columns=/index= keep the original labels: SimpleImputer returns a bare
# ndarray, and the original code restored columns only, dropping the row index
# (which breaks alignment with the target series).
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train_plus),
                             columns=X_train_plus.columns,
                             index=X_train_plus.index)
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid_plus),
                             columns=X_valid_plus.columns,
                             index=X_valid_plus.index)

LabelEncoder

sklearn.preprocessing.LabelEncoder
把离散标签转换为0~n-1的数字

# Categorical ("object"-dtype) columns with fewer than 10 distinct values —
# low-cardinality features suitable for encoding.
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]	# low-cardinality categorical columns (< 10 classes)

# NOTE(review): LabelEncoder is intended for *target* labels; scikit-learn
# recommends OrdinalEncoder for feature columns. Also, transform() below raises
# ValueError if X_valid contains a category never seen in X_train[col] —
# confirm every column in object_cols has the same category set in both splits.
label_encoder = LabelEncoder()
for col in object_cols:		# object_cols: the selected categorical columns (defined elsewhere)
    # Fit the 0..n-1 mapping on the training column, then reuse it on validation.
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

OneHotEncoder

sklearn.preprocessing.OneHotEncoder
Onehot 标签

# Apply one-hot encoder to each column with categorical data.
# handle_unknown='ignore' encodes categories unseen at fit time as all-zeros
# instead of raising. NOTE(review): `sparse=False` was renamed to
# `sparse_output=False` in scikit-learn 1.2 and removed in 1.4 — update the
# keyword when running against a recent scikit-learn.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed the row index; put it back so concat aligns rows.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Give the encoded columns real string names instead of the default integer
# 0..k-1 labels; mixing int and str column names in the concatenated frame
# breaks feature-name validation in newer scikit-learn estimators.
OH_cols_train.columns = OH_encoder.get_feature_names_out(object_cols)
OH_cols_valid.columns = OH_encoder.get_feature_names_out(object_cols)

# Remove categorical columns (will replace with one-hot encoding).
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features (row-aligned by index).
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

Pipeline

# Preprocessing pipeline for categorical features: fill missing values with
# the most frequent category, then one-hot encode (unknown categories at
# transform time become all-zero rows rather than errors).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

cross_val_score

sklearn.model_selection.cross_val_score

交叉验证

# Impute, then fit a 50-tree random forest; evaluate with 5-fold cross-validation.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)
# cross_val_score returns *negative* MAE (scikit-learn's "higher is better"
# convention), so negate it to get the plain per-fold MAE.
scores = -1 * cross_val_score(
    my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error'
)
# scores now holds the five per-fold validation errors.

xgboost.XGBRegressor

梯度提升

# Gradient-boosted trees. `early_stopping_rounds` was removed from fit() in
# XGBoost 2.0 and now belongs in the constructor: training stops once the
# validation metric has not improved for 5 consecutive rounds, so
# n_estimators=1000 is just an upper bound on the number of boosting rounds.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

删除含缺失值的列
X_train.dropna(axis=1)
axis=0 时删除含缺失值的行

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值