## 1. ensemble 集成
VotingClassifier(投票分类器)由若干异质分类器组成,例如逻辑回归(LR)、随机森林(RF)、高斯朴素贝叶斯(GNB)。
# Section 1: ensemble — a hard-voting classifier built from three
# heterogeneous base learners, compared against a lone random forest.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

# Load the iris dataset.
iris = datasets.load_iris()
# Reproducible 80/20 train/test split (random_state pins the shuffle so the
# ensemble-vs-single comparison below is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    iris['data'], iris['target'], test_size=0.2, random_state=42)

# Base learners: random forest, logistic regression, Gaussian naive Bayes.
rf = RandomForestClassifier(n_estimators=5, random_state=42)
# max_iter raised: lbfgs may not converge within the default 100 iterations.
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                        max_iter=1000)
gnb = GaussianNB()
# 'hard' voting = majority vote over the predicted class labels.
ensemble = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('gnb', gnb)], voting='hard')

# Compare the ensemble against the single random forest on held-out data.
rf.fit(X_train, y_train)
ensemble.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, rf.predict(X_test)))
print(metrics.accuracy_score(y_test, ensemble.predict(X_test)))
## 2. Multiclass 多分类与多标签
# Section 2: multiclass — wrap a binary classifier in a one-vs-one (OvO)
# strategy: one binary model is trained for every pair of classes.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

# Load digits (the original also loaded iris here but never used it —
# removed as dead work). Reproducible 80/20 split.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits['data'], digits['target'], test_size=0.2, random_state=42)

# OvO logistic regression over the 10 digit classes.
ovo_lr = OneVsOneClassifier(LogisticRegression(solver='lbfgs', max_iter=200))
ovo_lr.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, ovo_lr.predict(X_test)))
## 3. Multioutput 多输出
常用两个:MultiOutputRegressor(多输出回归)与 MultiOutputClassifier(多输出分类)。
# Section 3: multioutput — predict several targets per sample.
# MultiOutputClassifier fits one estimator per output column.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Load digits (the unused iris load from the original was dropped)
# and make a reproducible 80/20 split.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits['data'], digits['target'], test_size=0.2, random_state=42)


def _coarse_label(y):
    """Map digit labels 0-9 to a coarse 3-way label: 0-4 -> 0, 5-6 -> 1, 7-9 -> 2."""
    coarse = np.zeros_like(y)  # labels <= 4 stay 0
    coarse[np.logical_and(y > 4, y < 7)] = 1
    coarse[y >= 7] = 2
    return coarse


# np.c_ stacks the coarse and fine labels as columns -> shape (n_samples, 2),
# i.e. two targets per sample. (The original duplicated the mapping code for
# train and test; it is factored into _coarse_label above.)
y_train_multioutput = np.c_[_coarse_label(y_train), y_train]
y_test_multioutput = np.c_[_coarse_label(y_test), y_test]

mo = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42))
mo.fit(X_train, y_train_multioutput)

# Show predictions for the first five test samples against the truth.
print(mo.predict(X_test[:5, :]))
print(y_test_multioutput[:5, :])
## 4. 选择模型 model_selection
常见的:
cross_validate: 评估交叉验证的表现
learning_curve: 建立学习曲线
GridSearchCV: 用交叉验证从网格中一组超参数搜索出最佳超参数
RandomizedSearchCV: 用交叉验证从一组随机超参数搜索出最佳超参数
交叉验证:将数据随机平均分为K份,每次取其中一份作为验证集,在其余K-1份上训练模型并在该验证集上计算误差;轮流使每一份都充当一次验证集,故称交叉验证。
# Section 4: hyper-parameter search with cross-validation.
# (The original imported `time` but never used it — removed.)
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets

# Load the digits dataset.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Base estimator; random_state makes the searches reproducible.
rfc = RandomForestClassifier(n_estimators=20, random_state=42)

# Randomized search: sample n_iter settings from these distributions,
# score each with 5-fold cross-validation.
param_dist = {
    'max_depth': [3, 5],
    'max_features': randint(1, 11),
    'min_samples_split': randint(2, 11),
    'criterion': ['gini', 'entropy'],
}
n_iter_search = 20
random_search = RandomizedSearchCV(rfc, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5,
                                   random_state=42)
random_search.fit(X, y)
print(random_search.best_params_)
print(random_search.best_score_)

# Grid search: exhaustively evaluate every combination in the grid
# (2 * 3 * 3 * 2 = 36 candidates, each cross-validated 5-fold).
param_grid = {
    'max_depth': [3, 5],
    'max_features': [1, 3, 10],
    'min_samples_split': [2, 3, 10],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)
## 5. Pipeline 流水线
# Section 5: Pipeline — chain preprocessing steps (impute, then scale).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Two feature rows with missing entries (presumably weight and height —
# TODO confirm). np.NaN was removed in NumPy 2.0; use np.nan instead.
X = np.array([[50, 40, 30, 5, 7, 10, np.nan],
              [1.68, 1.83, 1.77, np.nan, 1.9, 1.65, 1.88]])
# Transpose so rows are samples and columns are features: shape (7, 2).
X = np.transpose(X)

# Step 1 fills NaNs with the column mean; step 2 rescales each column to [0, 1].
pipe = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('normalize', MinMaxScaler()),
])
X_proc = pipe.fit_transform(X)
print(X_proc)