task4 掌握回归模型的评估及超参数调优

!pip install numpy
Requirement already satisfied: numpy in c:\users\lenovo\anaconda3\lib\site-packages (1.19.2)
!pip install pandas
Requirement already satisfied: pandas in c:\users\lenovo\anaconda3\lib\site-packages (1.1.3)
Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2.8.1)
Requirement already satisfied: numpy>=1.15.4 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (1.19.2)
Requirement already satisfied: pytz>=2017.2 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2020.1)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)
!pip install matplotlib
Requirement already satisfied: matplotlib in c:\users\lenovo\anaconda3\lib\site-packages (3.3.2)
Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (2.4.7)
Requirement already satisfied: pillow>=6.2.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (8.0.1)
Requirement already satisfied: certifi>=2020.06.20 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (2020.6.20)
Requirement already satisfied: numpy>=1.15 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.19.2)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (2.8.1)
Requirement already satisfied: six in c:\users\lenovo\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib) (1.15.0)
# 引入相关科学计算包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot") 
import seaborn as sns
from sklearn import datasets
boston = datasets.load_boston() # 返回一个类似于字典的类
X = boston.data
y = boston.target
features = boston.feature_names
boston_data = pd.DataFrame(X,columns=features)
boston_data["Price"] = y
boston_data.head()
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPrice
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
sns.scatterplot(boston_data['NOX'],boston_data['Price'],color="r",alpha=0.6)
plt.title("Price~NOX")
plt.show()
C:\Users\lenovo\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-K8BUe6GK-1616577548323)(output_5_1.png)]

from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
features = iris.feature_names
iris_data = pd.DataFrame(X,columns=features)
iris_data['target'] = y
iris_data.head()

sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)target
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
# 可视化特征
marker = ['s','x','o']
for index,c in enumerate(np.unique(y)):
    plt.scatter(x=iris_data.loc[y==c,"sepal length (cm)"],y=iris_data.loc[y==c,"sepal width (cm)"],alpha=0.9,label=c,marker=marker[c])
plt.xlabel("sepal length (cm)")
plt.ylabel("sepal width (cm)")
plt.legend()
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-NSwUJolW-1616577548326)(output_7_0.png)]

# 生成月牙型非凸集
from sklearn import datasets
x, y = datasets.make_moons(n_samples=4000, shuffle=True,
                  noise=0.05, random_state=None)
for index,c in enumerate(np.unique(y)):
    plt.scatter(x[y==c,0],x[y==c,1],s=7)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-y1F53mnY-1616577548328)(output_8_0.png)]

from sklearn import datasets
x, y = datasets.make_blobs(n_samples=5000, n_features=2, centers=3)
for index,c in enumerate(np.unique(y)):
    plt.scatter(x[y==c,0],x[y==c,1],s=7)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Tq4trmmQ-1616577548331)(output_9_0.png)]

# 生成符合正态分布的聚类数据
from sklearn import datasets
x, y = datasets.make_blobs(n_samples=1000, n_features=2, centers=4)
for index,c in enumerate(np.unique(y)):
    plt.scatter(x[y==c, 1], x[y==c, 0],s=7)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-OKdX5YfH-1616577548332)(output_10_0.png)]

from sklearn import datasets
boston=datasets.load_boston()
X=boston.data
y=boston.target
features=boston.feature_names
boston_data=pd.DataFrame(X,columns=features)
boston_data["Price"]=y
boston_data.head()
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPrice
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
from sklearn import linear_model      # 引入线性回归方法
lin_reg = linear_model.LinearRegression()       # 创建线性回归的类
lin_reg.fit(X,y)        # 输入特征X和因变量y进行训练
print("模型系数:",lin_reg.coef_)             # 输出模型的系数
print("模型得分:",lin_reg.score(X,y))    # 输出模型的决定系数R^2
模型系数: [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]
模型得分: 0.7406426641094095
from sklearn.preprocessing import PolynomialFeatures
X_arr = np.arange(6).reshape(3, 2)
print("原始X为:\n",X_arr)

poly = PolynomialFeatures(2)
print("2次转化X:\n",poly.fit_transform(X_arr))

poly = PolynomialFeatures(interaction_only=True)
print("2次转化X:\n",poly.fit_transform(X_arr))
原始X为:
 [[0 1]
 [2 3]
 [4 5]]
2次转化X:
 [[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]
2次转化X:
 [[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]
!pip install pygam
Collecting pygam
  Downloading pygam-0.8.0-py2.py3-none-any.whl (1.8 MB)
Collecting progressbar2
  Downloading progressbar2-3.53.1-py2.py3-none-any.whl (25 kB)
Requirement already satisfied: scipy in c:\users\lenovo\anaconda3\lib\site-packages (from pygam) (1.5.2)
Requirement already satisfied: future in c:\users\lenovo\anaconda3\lib\site-packages (from pygam) (0.18.2)
Requirement already satisfied: numpy in c:\users\lenovo\anaconda3\lib\site-packages (from pygam) (1.19.2)
Collecting python-utils>=2.3.0
  Downloading python_utils-2.5.6-py2.py3-none-any.whl (12 kB)
Requirement already satisfied: six in c:\users\lenovo\anaconda3\lib\site-packages (from progressbar2->pygam) (1.15.0)
Installing collected packages: python-utils, progressbar2, pygam
Successfully installed progressbar2-3.53.1 pygam-0.8.0 python-utils-2.5.6
from pygam import LinearGAM
gam = LinearGAM().fit(boston_data[boston.feature_names], y)
gam.summary()
LinearGAM                                                                                                 
=============================================== ==========================================================
Distribution:                        NormalDist Effective DoF:                                    103.2423
Link Function:                     IdentityLink Log Likelihood:                                 -1589.7653
Number of Samples:                          506 AIC:                                             3388.0152
                                                AICc:                                            3442.7649
                                                GCV:                                               13.7683
                                                Scale:                                              8.8269
                                                Pseudo R-Squared:                                   0.9168
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
s(0)                              [0.6]                20           11.1         2.20e-11     ***         
s(1)                              [0.6]                20           12.8         8.15e-02     .           
s(2)                              [0.6]                20           13.5         2.59e-03     **          
s(3)                              [0.6]                20           3.8          2.76e-01                 
s(4)                              [0.6]                20           11.4         1.11e-16     ***         
s(5)                              [0.6]                20           10.1         1.11e-16     ***         
s(6)                              [0.6]                20           10.4         8.22e-01                 
s(7)                              [0.6]                20           8.5          4.44e-16     ***         
s(8)                              [0.6]                20           3.5          5.96e-03     **          
s(9)                              [0.6]                20           3.4          1.33e-09     ***         
s(10)                             [0.6]                20           1.8          3.26e-03     **          
s(11)                             [0.6]                20           6.4          6.25e-02     .           
s(12)                             [0.6]                20           6.5          1.11e-16     ***         
intercept                                              1            0.0          2.23e-13     ***         
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.


<ipython-input-77-bb049c74476f>:3: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be. 
 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  gam.summary()
from sklearn.tree import DecisionTreeRegressor    
reg_tree = DecisionTreeRegressor(criterion = "mse",min_samples_leaf = 5)
reg_tree.fit(X,y)
reg_tree.score(X,y)
0.9376307599929274
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler     # 标准化数据
from sklearn.pipeline import make_pipeline   # 使用管道,把预处理和模型形成一个流程

reg_svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
reg_svr.fit(X, y)
reg_svr.score(X,y)
0.7024525421955277
#定义向前逐步回归函数
def forward_select(data,target):
    variate=set(data.columns)  #将字段名转换成字典类型
    variate.remove(target)  #去掉因变量的字段名
    selected=[]
    current_score,best_new_score=float('inf'),float('inf')  #目前的分数和最好分数初始值都为无穷大(因为AIC越小越好)
    #循环筛选变量
    while variate:
        aic_with_variate=[]
        for candidate in variate:  #逐个遍历自变量
            formula="{}~{}".format(target,"+".join(selected+[candidate]))  #将自变量名连接起来
            aic=ols(formula=formula,data=data).fit().aic  #利用ols训练模型得出aic值
            aic_with_variate.append((aic,candidate))  #将第每一次的aic值放进空列表
        aic_with_variate.sort(reverse=True)  #降序排序aic值
        best_new_score,best_candidate=aic_with_variate.pop()  #最好的aic值等于删除列表的最后一个值,以及最好的自变量等于列表最后一个自变量
        if current_score>best_new_score:  #如果目前的aic值大于最好的aic值
            variate.remove(best_candidate)  #移除加进来的变量名,即第二次循环时,不考虑此自变量了
            selected.append(best_candidate)  #将此自变量作为加进模型中的自变量
            current_score=best_new_score  #最新的分数等于最好的分数
            print("aic is {},continuing!".format(current_score))  #输出最小的aic值
        else:
            print("for selection over!")
            break
    formula="{}~{}".format(target,"+".join(selected))  #最终的模型式子
    print("final formula is {}".format(formula))
    model=ols(formula=formula,data=data).fit()
    return(model)
import statsmodels.api as sm #最小二乘
from statsmodels.formula.api import ols #加载ols模型
forward_select(data=boston_data,target="Price")
aic is 3286.974956900157,continuing!
aic is 3171.5423142992013,continuing!
aic is 3114.0972674193326,continuing!
aic is 3097.359044862759,continuing!
aic is 3069.438633167217,continuing!
aic is 3057.9390497191152,continuing!
aic is 3048.438382711162,continuing!
aic is 3042.274993098419,continuing!
aic is 3040.154562175143,continuing!
aic is 3032.0687017003256,continuing!
aic is 3021.726387825062,continuing!
for selection over!
final formula is Price~LSTAT+RM+PTRATIO+DIS+NOX+CHAS+B+ZN+CRIM+RAD+TAX





<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x17ffbd96ca0>
lm=ols("Price~LSTAT+RM+PTRATIO+DIS+NOX+CHAS+B+ZN+CRIM+RAD+TAX",data=boston_data).fit()
lm.summary()
OLS Regression Results
Dep. Variable:Price R-squared: 0.741
Model:OLS Adj. R-squared: 0.735
Method:Least Squares F-statistic: 128.2
Date:Wed, 24 Mar 2021 Prob (F-statistic):5.54e-137
Time:14:16:26 Log-Likelihood: -1498.9
No. Observations: 506 AIC: 3022.
Df Residuals: 494 BIC: 3072.
Df Model: 11
Covariance Type:nonrobust
coefstd errtP>|t|[0.0250.975]
Intercept 36.3411 5.067 7.171 0.000 26.385 46.298
LSTAT -0.5226 0.047 -11.019 0.000 -0.616 -0.429
RM 3.8016 0.406 9.356 0.000 3.003 4.600
PTRATIO -0.9465 0.129 -7.334 0.000 -1.200 -0.693
DIS -1.4927 0.186 -8.037 0.000 -1.858 -1.128
NOX -17.3760 3.535 -4.915 0.000 -24.322 -10.430
CHAS 2.7187 0.854 3.183 0.002 1.040 4.397
B 0.0093 0.003 3.475 0.001 0.004 0.015
ZN 0.0458 0.014 3.390 0.001 0.019 0.072
CRIM -0.1084 0.033 -3.307 0.001 -0.173 -0.044
RAD 0.2996 0.063 4.726 0.000 0.175 0.424
TAX -0.0118 0.003 -3.493 0.001 -0.018 -0.005
Omnibus:178.430 Durbin-Watson: 1.078
Prob(Omnibus): 0.000 Jarque-Bera (JB): 787.785
Skew: 1.523 Prob(JB): 8.60e-172
Kurtosis: 8.300 Cond. No. 1.47e+04


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.47e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn import linear_model
reg_rid = linear_model.Ridge(alpha=.5)
reg_rid.fit(X,y)
reg_rid.score(X,y)
0.739957023371629
# 我们先来对未调参的SVR进行评价: 
from sklearn.svm import SVR     # 引入SVR类
from sklearn.pipeline import make_pipeline   # 引入管道简化学习流程
from sklearn.preprocessing import StandardScaler # 由于SVR基于距离计算,引入对数据进行标准化的类
from sklearn.model_selection import GridSearchCV  # 引入网格搜索调优
from sklearn.model_selection import cross_val_score # 引入K折交叉验证
from sklearn import datasets


boston = datasets.load_boston()     # 返回一个类似于字典的类
X = boston.data
y = boston.target
features = boston.feature_names
pipe_SVR = make_pipeline(StandardScaler(),
                                                         SVR())
score1 = cross_val_score(estimator=pipe_SVR,
                                                     X = X,
                                                     y = y,
                                                     scoring = 'r2',
                                                      cv = 10)       # 10折交叉验证
print("CV accuracy: %.3f +/- %.3f" % ((np.mean(score1)),np.std(score1)))
CV accuracy: 0.187 +/- 0.649
# 下面我们使用网格搜索来对SVR调参:
from sklearn.pipeline import Pipeline
pipe_svr = Pipeline([("StandardScaler",StandardScaler()),
                                                         ("svr",SVR())])
param_range = [0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
param_grid = [{"svr__C":param_range,"svr__kernel":["linear"]},  # 注意__是指两个下划线,一个下划线会报错的
                            {"svr__C":param_range,"svr__gamma":param_range,"svr__kernel":["rbf"]}]
gs = GridSearchCV(estimator=pipe_svr,
                                                     param_grid = param_grid,
                                                     scoring = 'r2',
                                                      cv = 10)       # 10折交叉验证
gs = gs.fit(X,y)
print("网格搜索最优得分:",gs.best_score_)
print("网格搜索最优参数组合:\n",gs.best_params_)
网格搜索最优得分: 0.6081303070817127
网格搜索最优参数组合:
 {'svr__C': 1000.0, 'svr__gamma': 0.001, 'svr__kernel': 'rbf'}
# 下面我们使用随机搜索来对SVR调参:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform  # 引入均匀分布设置参数
pipe_svr = Pipeline([("StandardScaler",StandardScaler()),
                                                         ("svr",SVR())])
distributions = dict(svr__C=uniform(loc=1.0, scale=4),    # 构建连续参数的分布
                     svr__kernel=["linear","rbf"],                                   # 离散参数的集合
                    svr__gamma=uniform(loc=0, scale=4))

rs = RandomizedSearchCV(estimator=pipe_svr,
                                                     param_distributions = distributions,
                                                     scoring = 'r2',
                                                      cv = 10)       # 10折交叉验证
rs = rs.fit(X,y)
print("随机搜索最优得分:",rs.best_score_)
print("随机搜索最优参数组合:\n",rs.best_params_)
随机搜索最优得分: 0.29880287171934033
随机搜索最优参数组合:
 {'svr__C': 3.1000343983291017, 'svr__gamma': 0.7023437908841612, 'svr__kernel': 'linear'}
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
feature = iris.feature_names
data = pd.DataFrame(X,columns=feature)
data['target'] = y
data.head()
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)target
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
#  逻辑回归
'''
penalty       {‘l1’, ‘l2’, ‘elasticnet’, ‘none’}, default=’l2’正则化方式
dual      bool, default=False   是否使用对偶形式,当n_samples> n_features时,默认dual = False。   
C        float, default=1.0      
solver       {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’     
l1_ratio         float, default=None           
'''
from sklearn.linear_model import LogisticRegression
log_iris = LogisticRegression()
log_iris.fit(X,y)
log_iris.score(X,y)
C:\Users\lenovo\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(





0.9733333333333334
# 线性判别分析
'''
参数:
solver:{'svd','lsqr','eigen'},默认='svd'
solver的使用,可能的值:
'svd':奇异值分解(默认)。不计算协方差矩阵,因此建议将此求解器用于具有大量特征的数据。

'lsqr':最小二乘解,可以与收缩结合使用。

'eigen':特征值分解,可以与收缩结合使用。
'''
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_iris = LinearDiscriminantAnalysis()
lda_iris.fit(X,y)
lda_iris.score(X,y)
   
0.98
# 朴素贝叶斯             
from sklearn.naive_bayes import GaussianNB
NB_iris = GaussianNB()
NB_iris.fit(X, y)
NB_iris.score(X,y)
0.96
# 使用决策树算法对iris分类:
'''
criterion:{“gini”, “entropy”}, default=”gini”
max_depth:树的最大深度。
min_samples_split:拆分内部节点所需的最少样本数
min_samples_leaf :在叶节点处需要的最小样本数。

'''
from sklearn.tree import DecisionTreeClassifier
tree_iris = DecisionTreeClassifier(min_samples_leaf=5)
tree_iris.fit(X,y)
tree_iris.score(X,y)
0.9733333333333334
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
'''
C:正则化参数。正则化的强度与C成反比。必须严格为正。惩罚是平方的l2惩罚。
kernel:{'linear','poly','rbf','sigmoid','precomputed'},默认='rbf'
degree:多项式和的阶数
gamma:“ rbf”,“ poly”和“ Sigmoid”的内核系数。
shrinking:是否软间隔分类,默认true

'''
svc_iris = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_iris.fit(X, y)
svc_iris.score(X,y)
0.9733333333333334
# 使用网格搜索进行超参数调优:
# 方式1:网格搜索GridSearchCV()
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
param_range = [0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
param_grid = [{'svc__C':param_range,'svc__kernel':['linear']},{'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1)
gs = gs.fit(X,y)
end_time = time.time()
print("网格搜索经历时间:%.3f S" % float(end_time-start_time))
print(gs.best_score_)
print(gs.best_params_)
网格搜索经历时间:9.560 S
0.9800000000000001
{'svc__C': 1.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
# 方式2:随机网格搜索RandomizedSearchCV()
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import time

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
param_range = [0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
param_grid = [{'svc__C':param_range,'svc__kernel':['linear']},{'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
# param_grid = [{'svc__C':param_range,'svc__kernel':['linear','rbf'],'svc__gamma':param_range}]
gs = RandomizedSearchCV(estimator=pipe_svc, param_distributions=param_grid,scoring='accuracy',cv=10,n_jobs=-1)
gs = gs.fit(X,y)
end_time = time.time()
print("随机网格搜索经历时间:%.3f S" % float(end_time-start_time))
print(gs.best_score_)
print(gs.best_params_)

随机网格搜索经历时间:0.135 S
0.9733333333333334
{'svc__kernel': 'rbf', 'svc__gamma': 0.0001, 'svc__C': 1000.0}
# 混淆矩阵:
# 加载数据
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",header=None)
'''
乳腺癌数据集:569个恶性和良性肿瘤细胞的样本,M为恶性,B为良性
'''
# 做基本的数据预处理
from sklearn.preprocessing import LabelEncoder

X = df.iloc[:,2:].values
y = df.iloc[:,1].values
le = LabelEncoder()    #将M-B等字符串编码成计算机能识别的0-1
y = le.fit_transform(y)
le.transform(['M','B'])
# 数据切分8:2
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
from sklearn.svm import SVC
pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=1))
from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train,y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test,y_pred=y_pred)
fig,ax = plt.subplots(figsize=(2.5,2.5))
ax.matshow(confmat, cmap=plt.cm.Blues,alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j,y=i,s=confmat[i,j],va='center',ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-izD63mN8-1616577548334)(output_33_0.png)]

# 绘制ROC曲线:
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import make_scorer,f1_score
scorer = make_scorer(f1_score,pos_label=0)
gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid,scoring=scorer,cv=10)
y_pred = gs.fit(X_train,y_train).decision_function(X_test)
#y_pred = gs.predict(X_test)
fpr,tpr,threshold = roc_curve(y_test, y_pred) ###计算真阳率和假阳率
roc_auc = auc(fpr,tpr) ###计算auc的值
plt.figure()
lw = 2
plt.figure(figsize=(7,5))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假阳率为横坐标,真阳率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic ')
plt.legend(loc="lower right")
plt.show()

<Figure size 432x288 with 0 Axes>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QIbjMlnU-1616577548336)(output_34_1.png)]


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值