目前,在pycharm中可以直接安装0.22版本的scikit-learn了:
在 Python 或者 Anaconda 中,也可以直接升级:
# python
pip install --upgrade scikit-learn
# conda
conda install scikit-learn
目录:
- stacking 模型融合
- 特征处理中利用KNN近邻填充–KNNImputer
- 便捷的roc-auc-curve曲线绘制
- 特征重要性评估
- KNN变换器
1. Stacking 模型融合
常见的模型融合思想有三种
- Bagging
- Boosting
- Stacking
前两种在scikit-learn中都有现成的算法包,Stacking没有提供。新版本中提供了Stacking融合思想的两种算法:StackingClassifier 和 StackingRegressor。在sklearn中主要有两个参数:
- estimators:前一级的子模型
- final_estimator:最终的子模型,根据分类和回归,对应了StackingClassifier 和 StackingRegressor;
demo
from sklearn.ensemble import StackingClassifier, StackingRegressor, RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Load the iris dataset as (features, labels).
X, y = load_iris(return_X_y=True)

# Hold out 15% of the samples for evaluation, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=20)

# First-level learners: a random forest and a scaled linear SVM.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=20)),
    ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=20))),
]

# Second-level learner combines the base models' predictions.
clf = StackingClassifier(estimators=base_learners,
                         final_estimator=LogisticRegression())
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))
2. 特征处理中利用KNN近邻填充–KNNImputer
在特征预处理阶段,存在多种缺失值填充方法,有中位数,众数,特定值等等。同时缺失值的邻居填充法也是很有用的一种。
目前,sklearn 中的 impute填充对应的填充方法:
- SimpleImputer:中位数,众数填充等等
- IterativeImputer:迭代填充【目前处于试验阶段】
- MissingIndicator:缺失值索引器
- KNNImputer
ps:
目前IterativeImputer还处于试验阶段,需要导入使用的话,要提前导入experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
demo
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- enables IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import MissingIndicator
from sklearn.impute import KNNImputer

# X1 is the "training" matrix; X2 holds the cells we want filled in.
X1 = np.array([[7, 3, 2],
               [4, np.nan, 6],
               [10, 5, 9]])
X2 = np.array([[np.nan, 2, 3],
               [4, np.nan, 6],
               [10, np.nan, 9]])

# Iterative (model-based) imputation: fit on X1, then fill the gaps in X2.
imp_mean = IterativeImputer(random_state=0)
imp_mean.fit(X1)
print(imp_mean.transform(X2))

# KNN imputation: each gap is filled from the 5 nearest rows,
# weighted by distance, using the NaN-aware Euclidean metric.
nn_imputer = KNNImputer(missing_values=np.nan,
                        n_neighbors=5,
                        weights="distance",
                        metric="nan_euclidean")
nn_imputer.fit(X1)
print(nn_imputer.transform(X2))
3. 便捷的roc-auc-curve曲线绘制
在分类任务中,roc-auc评估是一个非常重要的指标。旧版sklearn可以用roc_curve计算坐标点,但没有现成的绘图方法可以调用,需要自己编写:
def roc_exercise(y, pre_score, pos_label, plot=True):
    """Hand-rolled ROC curve: sweep a threshold over the predicted scores.

    Args:
        y: true labels, exactly two distinct values, aligned with pre_score.
        pre_score: predicted score for each sample.
        pos_label: the label value treated as the positive class.
        plot: when True, also draw the curve with matplotlib.

    Returns:
        (fpr_list, tpr_list): false/true positive rates, one pair per
        threshold; thresholds are the scores in descending order.
    """
    # Whatever label is not the positive one is the negative class.
    neg_label = tuple(set(y) - {pos_label})[0]
    # Thresholds are the scores themselves, highest first.
    # NOTE: use sorted() rather than pre_score.sort() -- sorting in place
    # would destroy the pairing between y[i] and pre_score[i] and produce
    # a wrong confusion matrix at every threshold.
    thresholds = sorted(pre_score, reverse=True)
    fpr_list = []
    tpr_list = []
    for thre in thresholds:
        TP_count = FP_count = FN_count = TN_count = 0
        # Predict positive when the score clears the threshold, then tally
        # the confusion-matrix cells against the true labels.
        for label, score in zip(y, pre_score):
            pred = pos_label if score >= thre else neg_label
            if pred == pos_label:
                if label == pos_label:
                    TP_count += 1
                else:
                    FP_count += 1
            else:
                if label == pos_label:
                    FN_count += 1
                else:
                    TN_count += 1
        tpr_list.append(TP_count / (TP_count + FN_count))
        fpr_list.append(FP_count / (FP_count + TN_count))
    print('thresholds', thresholds)
    print('fpr_list', fpr_list)
    print('tpr_list', tpr_list)
    if plot:
        # Import locally so the computation works without matplotlib.
        from matplotlib import pyplot as plt
        plt.plot(fpr_list, tpr_list, color='b')
        plt.grid(True)
        plt.title('ROC curve')
        plt.show()
    return fpr_list, tpr_list
# Exercise the hand-rolled ROC helper on a tiny 5-sample example,
# treating label 2 as the positive class.
y = np.array([1, 1, 2, 2, 2])
pre_score = np.array([0.1, 0.4, 0.35, 0.8, 0.6])
pos_label = 2
roc_exercise(y, pre_score, pos_label)
在新的sklearn版本中,添加了画roc-auc曲线的方法:
from sklearn.metrics import roc_auc_score, plot_roc_curve
demo1
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, plot_roc_curve

# Synthetic 4-class problem; probability=True enables predict_proba on SVC.
X, y = make_classification(n_samples=1000, n_classes=4, n_informative=16)
clf = SVC(decision_function_shape='ovo', probability=True)
clf.fit(X, y)

# One-vs-one multi-class AUC over the predicted class probabilities.
y_pred = clf.predict_proba(X)
print(roc_auc_score(y, y_pred, multi_class='ovo'))
demo2
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Binary toy problem; fixed seeds make the curves reproducible.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit two different classifiers on the same split.
svc = SVC(random_state=42)
rfc = RandomForestClassifier(random_state=42)
svc.fit(X_train, y_train)
rfc.fit(X_train, y_train)

# Draw both ROC curves on one axis for a side-by-side comparison:
# the second call reuses the axis created by the first.
svc_disp = plot_roc_curve(estimator=svc, X=X_test, y=y_test)
rfc_disp = plot_roc_curve(estimator=rfc, X=X_test, y=y_test, ax=svc_disp.ax_)
rfc_disp.figure_.suptitle('ROC curve comparison')
plt.show()
4. 特征重要性评估
可以利用算法来选择特征的重要性,同时新版本的sklearn中也提供了特征重要性评估的方法:
from sklearn.inspection import permutation_importance
demo:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt

# 5-feature binary classification problem with a fixed seed.
X, y = make_classification(n_samples=1000, n_features=5, n_classes=2, random_state=20)
rf = RandomForestClassifier(random_state=10)
rf.fit(X, y)

# Shuffle each feature n_repeats times and record the score drop per repeat.
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0)

fig, ax = plt.subplots()
sorted_idx = result.importances_mean.argsort()
# FIX: rows are reordered by sorted_idx, so label them with sorted_idx
# as well -- labels=range(...) would mislabel the reordered boxes.
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=sorted_idx)
ax.set_title("Permutation Importance of each feature")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()
5. KNN变换器
KNeighborsTransformer 把近邻图的计算独立成一个转换器:配合 Pipeline 的 memory 缓存,修改下游步骤的参数后重新 fit 时,可以复用已经算好的近邻图。示例代码如下。
from tempfile import TemporaryDirectory, TemporaryFile
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification

# Toy data for the pipeline below.
X, y = make_classification(random_state=0)

# Cache the KNN graph in a temp dir so that refitting after a parameter
# change on the *downstream* step reuses the precomputed graph.
# FIX: the with-block body must be indented (original was flat -> SyntaxError).
with TemporaryDirectory(prefix='sklean_cache_') as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode='distance'),
        Isomap(n_neighbors=10, metric='precomputed'),
        memory=tmpdir,
    )
    estimator.fit(X)
    # Only the Isomap step changes; the cached KNN transform is reused.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)