Logistic 回归—网格搜索最优参数笔记

最新推荐文章于 2023-06-27 09:23:09 发布

语亦情非

最新推荐文章于 2023-06-27 09:23:09 发布

阅读量1.4k

点赞数

分类专栏：机器学习、数据挖掘面试

原文链接：https://blog.csdn.net/evolution23/article/details/85028361

版权

机器学习同时被 2 个专栏收录

25 篇文章 1 订阅

订阅专栏

数据挖掘面试

25 篇文章 1 订阅

订阅专栏

1、准备


 
 
   
   
    
    
   
   
   
   
    
    
     
     # 首先 import 必要的模块
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     import pandas 
     
     as pd 
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     import numpy 
     
     as np
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     from sklearn.model_selection 
     
     import GridSearchCV
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     #竞赛的评价指标为logloss
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     from sklearn.metrics 
     
     import log_loss  
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     from matplotlib 
     
     import pyplot
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     import seaborn 
     
     as sns
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     %matplotlib inline
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     # Load the training data and take a first look at it.
     data = pd.read_csv('Otto_train.csv')
     data.head()
     data.info()
     data.describe()
     data.shape

     # Keep only the first 20,000 rows because of limited machine resources.
     # NOTE(review): a head slice is representative only if the rows in the
     # CSV are shuffled -- confirm, otherwise the class mix below is skewed.
     data = data[:20000]

     # Target distribution: check whether the classes are balanced.
     # The column is passed by keyword because recent seaborn releases treat
     # the first positional argument of countplot as `data`, not as `x`.
     sns.countplot(x=data.target)
     pyplot.xlabel('target');
     pyplot.ylabel('Number of occurrences');

2、数据标准化


 
 
   
   
    
    
   
   
   
   
    
    
     
     # Turn the class-label strings into zero-based integers: drop the first
     # six characters of each label, parse the remainder as an int and
     # subtract 1.
     # (presumably labels look like 'Class_3' -- verify against the CSV)
     y_train = data.target.map(lambda label: int(label[6:]) - 1)

     # Every remaining column except 'target' and 'id' becomes a feature.
     data = data.drop(columns=['target', 'id'])
     X_train = np.array(data)
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     # Feature standardization.
     from sklearn.preprocessing import StandardScaler

     # Create the scaler for the features.
     ss_X = StandardScaler()

     # Learn the scaling statistics from the training features, then apply
     # them to the same matrix (only training data is scaled here).
     ss_X.fit(X_train)
     X_train = ss_X.transform(X_train)
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     from sklearn.linear_model import LogisticRegression
     # cross_val_score is provided by sklearn.model_selection; the former
     # sklearn.cross_validation module no longer exists in scikit-learn.
     from sklearn.model_selection import cross_val_score

     # Baseline classifier with default hyper-parameters.
     lr = LogisticRegression()

     # Cross-validation is used both to estimate model performance and to
     # tune parameters (model selection). For classification tasks an integer
     # `cv` uses StratifiedKFold by default.
     loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')

     # The scorer yields the negated log loss, so flip the sign when reporting.
     print('logloss of each fold is: ', -loss)
     print('cv logloss is:', -loss.mean())

3、调用GridSearchCV进行参数调优


 
 
   
   
    
    
   
   
   
   
    
    
     
     from sklearn.model_selection import GridSearchCV
     from sklearn.linear_model import LogisticRegression

     # Hyper-parameters to tune: the penalty type and the inverse
     # regularization strength C.
     # Suggested exercise: tune L1 and L2 separately, each paired with a
     # suitable optimization solver.
     penaltys = ['l1', 'l2']
     Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
     tuned_parameters = dict(penalty=penaltys, C=Cs)

     # The solver is set explicitly because the grid contains 'l1': the
     # default solver of current scikit-learn (lbfgs) rejects an L1 penalty,
     # while liblinear accepts both 'l1' and 'l2'.
     # NOTE(review): recent scikit-learn releases deprecate liblinear for
     # multiclass targets; 'saga' also supports both penalties -- confirm
     # against the installed version.
     lr_penalty = LogisticRegression(solver='liblinear')

     # return_train_score=True makes cv_results_ include the
     # 'mean_train_score' / 'std_train_score' entries, which are not
     # computed by default.
     grid = GridSearchCV(lr_penalty, tuned_parameters, cv=5,
                         scoring='neg_log_loss', return_train_score=True)
     grid.fit(X_train, y_train)

     grid.cv_results_

     # best_score_ is the negated log loss; flip the sign to report log loss.
     print(-grid.best_score_)
     print(grid.best_params_)
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
     
    
    
   
   

   
   
    
    
   
   
   
   
    
    
     
     # Plot the cross-validation score curves.
     n_Cs = len(Cs)
     number_penaltys = len(penaltys)

     # Fetch the per-candidate means and standard deviations and arrange them
     # as (one row per C, one column per penalty).
     # NOTE(review): this layout assumes the search enumerates candidates with
     # C as the outer loop and penalty as the inner loop -- confirm. The
     # *_train_score entries are present only when the search was created
     # with return_train_score=True.
     test_means = grid.cv_results_['mean_test_score']
     test_scores = np.array(test_means).reshape(n_Cs, number_penaltys)
     test_stds = grid.cv_results_['std_test_score']
     test_stds = np.array(test_stds).reshape(n_Cs, number_penaltys)

     train_means = grid.cv_results_['mean_train_score']
     train_scores = np.array(train_means).reshape(n_Cs, number_penaltys)
     train_stds = grid.cv_results_['std_train_score']
     train_stds = np.array(train_stds).reshape(n_Cs, number_penaltys)

     # One test curve and one train curve per penalty, drawn against log10(C)
     # with the standard deviation as error bars.
     x_axis = np.log10(Cs)
     for i, value in enumerate(penaltys):
         pyplot.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i],
                         label=value + ' Test')
         pyplot.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i],
                         label=value + ' Train')

     pyplot.legend()
     pyplot.xlabel('log(C)')
     pyplot.ylabel('neg-logloss')
     pyplot.savefig('LogisticGridSearchCV_C.png')

     pyplot.show()

                    <li class="tool-item tool-active is-like "><a href="javascript:;"><svg class="icon" aria-hidden="true">
                        <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#csdnc-thumbsup"></use>
                    </svg><span class="name">点赞</span>
                    <span class="count">1</span>
                    </a></li>
                    <li class="tool-item tool-active is-collection "><a href="javascript:;" data-report-click="{&quot;mod&quot;:&quot;popu_824&quot;}"><svg class="icon" aria-hidden="true">
                        <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#icon-csdnc-Collection-G"></use>
                    </svg><span class="name">收藏</span></a></li>
                    <li class="tool-item tool-active is-share"><a href="javascript:;" data-report-click="{&quot;mod&quot;:&quot;1582594662_002&quot;}"><svg class="icon" aria-hidden="true">
                        <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#icon-csdnc-fenxiang"></use>
                    </svg>分享</a></li>
                    <!--打赏开始-->
                                            <!--打赏结束-->
                                            <li class="tool-item tool-more">
                        <a>
                        <svg t="1575545411852" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="5717" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><defs><style type="text/css"></style></defs><path d="M179.176 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5718"></path><path d="M509.684 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5719"></path><path d="M846.175 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5720"></path></svg>
                        </a>
                        <ul class="more-box">
                            <li class="item"><a class="article-report">文章举报</a></li>
                        </ul>
                    </li>
                                        </ul>
            </div>
                        </div>
        <div class="person-messagebox">
            <div class="left-message"><a href="https://blog.csdn.net/evolution23">
                <img src="https://profile.csdnimg.cn/2/3/3/3_evolution23" class="avatar_pic" username="evolution23">
                                        <img src="https://g.csdnimg.cn/static/user-reg-year/1x/9.png" class="user-years">
                                </a></div>
            <div class="middle-message">
                                    <div class="title"><span class="tit"><a href="https://blog.csdn.net/evolution23" data-report-click="{&quot;mod&quot;:&quot;popu_379&quot;}" target="_blank">二月鳥</a></span>
                                        </div>
                <div class="text"><span>发布了19 篇原创文章</span> · <span>获赞 7</span> · <span>访问量 2万+</span></div>
            </div>
                            <div class="right-message">
                                        <a href="https://im.csdn.net/im/main.html?userName=evolution23" target="_blank" class="btn btn-sm btn-red-hollow bt-button personal-letter">私信
                    </a>
                                                        <a class="btn btn-sm  bt-button personal-watch" data-report-click="{&quot;mod&quot;:&quot;popu_379&quot;}">关注</a>
                                </div>
                        </div>
                </div>

语亦情非

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
Logistic 回归—网格搜索最优参数笔记

1、准备# 首先 import 必要的模块import pandas as pd import numpy as np from sklearn.model_selection import...
复制链接

扫一扫

专栏目录