【Python】Common Toolkit Series 4: sklearn

Author: YY同学Serendipity

Life never stops, and neither does code. Fun projects live on GitHub!


1. Missing Value Imputation


# SimpleImputer
from sklearn.impute import SimpleImputer

# median: fill with the column median
# mean: fill with the column mean
# most_frequent: fill with the most frequent value (for categorical features)

imputer = SimpleImputer(strategy='median')
data[['some_col']] = imputer.fit_transform(data[['some_col']])  # imputers expect 2-D input


# KNNImputer (works best on several numeric columns at once)
from sklearn.impute import KNNImputer
imputer = KNNImputer()
data[['some_col']] = imputer.fit_transform(data[['some_col']])
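
The same call also works on several columns at once; fit_transform expects 2-D input, which is why the double brackets appear above. A minimal sketch (the column names are hypothetical placeholders):

from sklearn.impute import SimpleImputer

num_cols = ['col_a', 'col_b']                           # hypothetical numeric columns
imputer = SimpleImputer(strategy='median')
data[num_cols] = imputer.fit_transform(data[num_cols])
imputer.statistics_                                     # one learned median per column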

2. Feature Scaling


# Suitable for (roughly) normally distributed data without outliers
from sklearn.preprocessing import StandardScaler
data[['some_col1']] = StandardScaler().fit_transform(data[['some_col1']])  # instantiate the scaler first


# Suitable for normally distributed data with outliers (uses median and IQR)
from sklearn.preprocessing import RobustScaler
data[['some_col2']] = RobustScaler().fit_transform(data[['some_col2']])


# Suitable for data that is not normally distributed (scales to [0, 1])
from sklearn.preprocessing import MinMaxScaler
data[['some_col3']] = MinMaxScaler().fit_transform(data[['some_col3']])
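
As with the imputers, a scaler should learn its statistics from the training set only and then be reused on the test set, so that test information does not leak into training. A minimal sketch with StandardScaler, assuming X_train and X_test are the training and test frames:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['some_col1']] = scaler.fit_transform(X_train[['some_col1']])  # learn mean/std from training data only
X_test[['some_col1']] = scaler.transform(X_test[['some_col1']])        # reuse the training statistics
scaler.mean_, scaler.scale_                                            # the learned parameters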

3. Feature Encoding


# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
# 'sparse_output' was named 'sparse' before scikit-learn 1.2
ohe = OneHotEncoder(sparse_output=False).fit(data[['some_col']])
ohe.categories_


# Label encoding
from sklearn.preprocessing import LabelEncoder
data['new_col'] = LabelEncoder().fit_transform(data['origin_col'])
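
To actually encode the data, call transform on the fitted encoder; get_feature_names_out (available in scikit-learn >= 1.0) labels the generated columns. A minimal sketch reusing the ohe fitted above:

encoded = ohe.transform(data[['some_col']])   # one 0/1 column per category
ohe.get_feature_names_out()                   # names of the generated columns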

4. Dataset Handling


# Split into training and test sets
from sklearn.model_selection import train_test_split

y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# train/test = 7/3
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
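
For a reproducible split pass random_state, and for classification tasks stratify keeps the class proportions equal in both sets. A minimal sketch reusing X and y from above:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y)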


# Cross-validation
from sklearn.model_selection import cross_validate

# r2: coefficient of determination (R^2)

cv = cross_validate(model, X_train, y_train, cv=5, scoring='r2')
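
cross_validate returns a dict with one entry per recorded quantity; the per-fold scores are stored under 'test_score'. A minimal sketch:

cv['test_score']           # one R^2 score per fold
cv['test_score'].mean()    # average score over the 5 folds
cv['fit_time']             # training time of each fold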

5. Evaluation Metrics


# Plot the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='all')


# Precision = TP / (TP + FP)
from sklearn.metrics import precision_score
score = precision_score(y_test, y_predict)


# Recall / sensitivity = TP / (TP + FN)
from sklearn.metrics import recall_score
score = recall_score(y_test, y_predict)


# Accuracy = (TP + TN) / (TP + FP + TN + FN)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_predict)


# F1-score = 2PR / (P + R)
from sklearn.metrics import f1_score
score = f1_score(y_test, y_predict)


# Root Mean Squared Error (RMSE)
import numpy as np
from sklearn.metrics import mean_squared_error
score = np.sqrt(mean_squared_error(y_test, y_predict))
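
When several classification metrics are needed at once, classification_report prints precision, recall, and F1 for every class in a single call. A minimal sketch:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))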

6. Feature Importance


# Permutation importance
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
mdl = LinearRegression().fit(X_train, y_train)
permutation_score = permutation_importance(mdl, X_train, y_train, n_repeats=10)
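
The result holds the mean and standard deviation of the score drop for every feature across the n_repeats shuffles; sorting the means ranks the features. A minimal sketch, assuming X_train is a pandas DataFrame:

import pandas as pd

importances = pd.Series(permutation_score.importances_mean, index=X_train.columns)
importances.sort_values(ascending=False)   # most important features first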

7. Machine Learning Models (Classification)


# Dummy classifier
from sklearn.dummy import DummyClassifier
mdl = DummyClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# K-nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
mdl = KNeighborsClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Stochastic gradient descent (SGD) classifier
from sklearn.linear_model import SGDClassifier
mdl = SGDClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Support vector machine (SVM)
from sklearn.svm import SVC
mdl = SVC(kernel='linear')
mdl.fit(X_train, y_train)
mdl.predict(X_test)

from sklearn.svm import LinearSVC
mdl = LinearSVC()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
mdl = DecisionTreeClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)

8. Machine Learning Models (Regression)


# Dummy regressor
from sklearn.dummy import DummyRegressor
mdl = DummyRegressor(strategy='mean')
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Linear regression
from sklearn.linear_model import LinearRegression
mdl = LinearRegression()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Logistic regression (despite the name, a classification model)
from sklearn.linear_model import LogisticRegression
mdl = LogisticRegression()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Stochastic gradient descent (SGD) regressor
from sklearn.linear_model import SGDRegressor
mdl = SGDRegressor(loss='squared_error')  # 'squared_loss' was renamed to 'squared_error' in recent scikit-learn versions
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# K-nearest neighbors regressor
from sklearn.neighbors import KNeighborsRegressor
mdl = KNeighborsRegressor()
mdl.fit(X_train, y_train)
mdl.predict(X_test)

9. Ensemble Learning Models


# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
mdl = RandomForestClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Bagging classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
knn_class = KNeighborsClassifier()
# 'estimator' was named 'base_estimator' before scikit-learn 1.2
mdl = BaggingClassifier(estimator=knn_class, n_estimators=15, max_samples=0.5, max_features=0.5)
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Voting classifier (the base estimators must be classifiers)
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
estimators = [('lr', LogisticRegression()), ('sgd', SGDClassifier()), ('rf', RandomForestClassifier())]
mdl = VotingClassifier(estimators=estimators, voting='hard')
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Stacking classifier (the base estimators must be classifiers)
from sklearn.ensemble import StackingClassifier
estimators = [('lr', LogisticRegression()), ('sgd', SGDClassifier()), ('rf', RandomForestClassifier())]
mdl = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
mdl.fit(X_train, y_train)
mdl.predict(X_test)


# Isolation forest (unsupervised anomaly detection)
from sklearn.ensemble import IsolationForest
mdl = IsolationForest()
mdl.fit(X_train)       # unsupervised: no labels needed
mdl.predict(X_test)    # returns 1 for inliers, -1 for outliers


# XGBoost classifier (from the separate xgboost package)
from xgboost import XGBClassifier
mdl = XGBClassifier()
mdl.fit(X_train, y_train)
mdl.predict(X_test)

10. Hyperparameter Search


# Grid Search
from sklearn.model_selection import GridSearchCV
params = {'alpha': [1, 10, 100], 'eta': [0.1, 0.01, 0.001, 0.0001]}  # keys must match the estimator's hyperparameter names
mdl = GridSearchCV(model, params, cv=5, n_jobs=-1)
mdl.fit(X_train, y_train)
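
After fitting, the search object exposes the best combination it found and can be used directly for prediction. A minimal sketch:

mdl.best_params_      # best hyperparameter combination
mdl.best_score_       # mean cross-validated score of that combination
mdl.best_estimator_   # model refitted on the full training set with those hyperparameters
mdl.predict(X_test)   # predictions from the refitted best model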


# Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
params = {'num':stats.randint(1,20), 'p':[1,2,3,4]}
mdl = RandomizedSearchCV(model, params, n_iter=20, cv=5, n_jobs=-1)
mdl.fit(X_train, y_train)


# Bayesian Optimization Search
from skopt import BayesSearchCV
from skopt.space import Integer
params = {'num':Integer(1,10, prior='uniform'), 'p':[1,2,3,4]}
mdl = BayesSearchCV(model, params, n_iter=20, cv=5, n_jobs=-1)
mdl.fit(X_train, y_train)

11. Pipelines


# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
pipe = make_pipeline(SimpleImputer(), StandardScaler())
pipe.fit(X_train)


# Column transformer (apply a different pipeline to different columns)
from sklearn.compose import ColumnTransformer

# num_pipe / cat_pipe are preprocessing pipelines built beforehand,
# e.g. num_pipe = make_pipeline(SimpleImputer(), StandardScaler())
preproc_pipe = ColumnTransformer([
	("num_prep", num_pipe, ['num_col']),  # numerical columns: pass column names, not values
	("cat_prep", cat_pipe, ['cat_col'])   # categorical columns
])
preproc_pipe.fit(X_train)
X_train_preproc = preproc_pipe.transform(X_train)
X_test_preproc = preproc_pipe.transform(X_test)
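
The column transformer can itself become the first step of a pipeline, so preprocessing and the model are fitted and applied together. A minimal sketch using a RandomForestClassifier as a placeholder final step:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

full_pipe = Pipeline([
	('preproc', preproc_pipe),
	('model', RandomForestClassifier())
])
full_pipe.fit(X_train, y_train)
full_pipe.predict(X_test)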


# Function transformer (wrap an arbitrary function as a transformer)
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

def replace_func(df):
	# do something with the selected column(s)
	return df

replace = FunctionTransformer(replace_func)
replace_pipe = make_column_transformer((replace, ['replace_name']))

12. Dimensionality Reduction


# Principal component analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=50)                          # reduce to 50 dimensions
X_train_proj = pca.fit_transform(X_train)           # project onto the principal components
data_origin = pca.inverse_transform(X_train_proj)   # map back to the original feature space
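
The explained variance ratio shows how much information each component keeps and helps choose n_components. A minimal sketch reusing the fitted pca from above:

import numpy as np

pca.explained_variance_ratio_               # variance explained by each component
np.cumsum(pca.explained_variance_ratio_)    # cumulative share, e.g. keep enough components for 95%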
