Machine Learning: Classification and Regression
Model Selection
from sklearn import datasets
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import numpy as np
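The snippets below assume X_train, y_train, X_test, and y_test already exist; as a minimal stand-in (the notebook's actual dataset is not shown), the bundled iris data can be split:
# Hypothetical data setup: the original dataset is not shown, so the
# bundled iris data stands in for X_train / X_test / y_train / y_test.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)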
cl1 = ExtraTreesClassifier()
cl2 = RandomForestClassifier(random_state=1)
cl3 = AdaBoostClassifier()
cl4 = GradientBoostingClassifier()
cl5 = GaussianProcessClassifier()
cl6 = LogisticRegression()
cl7 = RidgeClassifier()
cl8 = SGDClassifier()
cl9 = GaussianNB()
cl10 = KNeighborsClassifier(n_neighbors=1)
cl11 = MLPClassifier()
cl12 = SVC()
cl13 = NuSVC()
cl14 = DecisionTreeClassifier()
cl15 = ExtraTreeClassifier()
cl16 = XGBClassifier()
cl17 = LGBMClassifier()
print('3-fold cross validation:\n')
for clf, label in zip([cl1, cl2, cl3, cl4, cl5, cl6, cl7, cl8, cl9, cl10,
                       cl11, cl12, cl13, cl14, cl15, cl16, cl17],
                      ['ExtraTreesClassifier',
                       'RandomForestClassifier',
                       'AdaBoostClassifier',
                       'GradientBoostingClassifier',
                       'GaussianProcessClassifier',
                       'LogisticRegression',
                       'RidgeClassifier',
                       'SGDClassifier',
                       'GaussianNB',
                       'KNeighborsClassifier',
                       'MLPClassifier',
                       'SVC',
                       'NuSVC',
                       'DecisionTreeClassifier',
                       'ExtraTreeClassifier',
                       'XGBClassifier',
                       'LGBMClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    # Report the mean accuracy +/- standard deviation across the 3 folds.
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
# Grid-search RandomForest hyperparameters. This grid is an illustrative
# example; widen it as compute allows.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}
gsc = GridSearchCV(RandomForestClassifier(random_state=1), param_grid)
gsc.fit(X_train, y_train)
print('Best cross-validation score:', gsc.best_score_)
print('Best parameters:')
best_params = gsc.best_estimator_.get_params()
print(best_params)
for param_name in sorted(param_grid.keys()):
    print(param_name, ':', best_params[param_name])
# Evaluate the refit best estimator on the train and test splits.
print(gsc.score(X_train, y_train))
print(gsc.score(X_test, y_test))
Tuning Common Models
Tuning XGBClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Sweep n_estimators from 1 to 99 (0 is not a valid value) and record the
# mean 15-fold cross-validation accuracy for each setting.
acc_list = []
for i in range(1, 100):
    acc = cross_val_score(XGBClassifier(n_estimators=i, min_child_weight=0.5, max_depth=6),
                          X_train, y_train, cv=15).mean()
    acc_list.append(acc)

plt.style.use('seaborn-v0_8')  # the 'seaborn' style was renamed in matplotlib >= 3.6
plt.plot(range(1, 100), acc_list)

# Pick the tree count with the highest cross-validated accuracy.
best_n = np.argmax(acc_list) + 1  # +1 because the sweep starts at 1
print(best_n, acc_list[best_n - 1])

rf = XGBClassifier(n_estimators=best_n, min_child_weight=0.5, max_depth=6)
rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))
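The remaining XGBoost parameters can be tuned the same way with GridSearchCV; a minimal sketch, with a small grid whose values are illustrative assumptions rather than taken from the original:
# Illustrative grid over two common XGBoost parameters; the values are
# assumptions, not from the original notebook.
xgb_grid = {'max_depth': [3, 6, 9], 'min_child_weight': [0.5, 1, 2]}
xgb_gsc = GridSearchCV(XGBClassifier(n_estimators=best_n), xgb_grid, cv=5)
xgb_gsc.fit(X_train, y_train)
print(xgb_gsc.best_params_, xgb_gsc.best_score_)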
Tuning ExtraTreesClassifier (Extremely Randomized Trees)
# Model training and prediction
from sklearn.ensemble import ExtraTreesClassifier
etsc_params = {
    'class_weight': 'balanced',
    'criterion': 'gini',
    'max_depth': None,
    'min_samples_split': 3,
    'n_estimators': 30,
}
etsc = ExtraTreesClassifier(**etsc_params)
etsc.fit(X_train, y_train)
y_pred = etsc.predict(X_test)
print(etsc.score(X_train, y_train))
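A fitted tree ensemble also exposes impurity-based feature importances, which help sanity-check what the model relies on; a small addition, not in the original:
# Rank input features by the ensemble's impurity-based importance.
for idx in np.argsort(etsc.feature_importances_)[::-1]:
    print('feature %d: %.3f' % (idx, etsc.feature_importances_[idx]))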
Model Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# Five random splits of the training data: 70% to fit, 20% to score.
cv_split = ShuffleSplit(n_splits=5, train_size=0.7, test_size=0.2)
score_ndarray = cross_val_score(etsc, X_train, y_train, cv=cv_split)
print(score_ndarray)
print(score_ndarray.mean())
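Since y_pred for the test split was computed above, scikit-learn's standard per-class report rounds out the evaluation:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class precision/recall/F1 on the test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))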
Tuning LogisticRegression
from sklearn.model_selection import GridSearchCV
penalties = ['l1', 'l2']  # l1 or l2 regularization
cs = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5]
param_grid = {'penalty': penalties, 'C': cs}
# The liblinear solver supports both l1 and l2 penalties; the default
# lbfgs solver handles l2 only and would fail on the l1 half of this grid.
gsc = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid)
gsc.fit(X_train, y_train)
print('Best cross-validation score:', gsc.best_score_)
print('Best parameters:')
best_params = gsc.best_estimator_.get_params()
print(best_params)
for param_name in sorted(param_grid.keys()):
    print(param_name, ':', best_params[param_name])
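As before, GridSearchCV refits the best estimator on the full training data by default, so the search object can score the held-out set directly:
# Score the refit best LogisticRegression on the test split.
print(gsc.score(X_test, y_test))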