1、准备
-
# Third-party modules used throughout this notebook.
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
# The competition's evaluation metric is log loss.
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
-
%matplotlib inline
-
-
# Load the training set and take a first look at it.  In a notebook these
# bare expressions are meant to be evaluated so that their result is shown.
data = pd.read_csv('Otto_train.csv')
data.head()
data.info()
data.describe()
data.shape

# Keep only the first 20,000 rows because of limited machine performance.
data = data[:20000]

# Target distribution: check whether the classes are balanced.
# The column is passed as `x=` because recent seaborn releases only accept
# the plotting variable by keyword.
sns.countplot(x=data.target)
pyplot.xlabel('target')
# The trailing semicolon suppresses the echoed return value in a notebook.
pyplot.ylabel('Number of occurrences');
2、数据标准化
-
# Convert the class strings to zero-based integer labels: drop the first six
# characters (presumably a "Class_" prefix -- confirm against the CSV) and
# shift the remaining number down by one.
y_train = data.target.map(lambda label: int(label[6:]) - 1)

# Everything except the label and the row id is used as a feature.
data = data.drop(['target', 'id'], axis=1)
X_train = np.array(data)

# Standardize the features.
from sklearn.preprocessing import StandardScaler

# Create the scaler, then fit it on the training features and transform them
# in one step.  Only training data is handled here.
ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
-
-
from sklearn.linear_model import LogisticRegression
# cross_val_score is imported from sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()

# Cross-validation is used to estimate model performance and to tune
# hyper-parameters (model selection).  For classification tasks an integer
# `cv` defaults to StratifiedKFold.
loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')

# The scorer yields negated log loss, so the sign is flipped for reporting.
print('logloss of each fold is: ', -loss)
print('cv logloss is:', -loss.mean())
3、调用GridSearchCV进行参数调优
-
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to tune: the penalty type and the value of C.
# Exercise hint: also try tuning L1 and L2 separately, each paired with a
# suitable optimization solver.
penaltys = ['l1', 'l2']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penaltys, C=Cs)

# The solver is named explicitly because the grid contains both 'l1' and
# 'l2'; liblinear handles both penalties.
lr_penalty = LogisticRegression(solver='liblinear')

# return_train_score=True makes cv_results_ include 'mean_train_score' and
# 'std_train_score', which the CV-curve plot later in the file reads.
grid = GridSearchCV(
    lr_penalty,
    tuned_parameters,
    cv=5,
    scoring='neg_log_loss',
    return_train_score=True,
)
grid.fit(X_train, y_train)

# Bare expression: shown as the cell output when run in a notebook.
grid.cv_results_

# best_score_ is negated log loss, so flip the sign to report log loss.
print(-grid.best_score_)
print(grid.best_params_)
-
-
-
# Plot the cross-validation error curves.
# NOTE: the two train-score keys are only present in cv_results_ when the
# search was created with return_train_score=True.
test_means = grid.cv_results_['mean_test_score']
test_stds = grid.cv_results_['std_test_score']
train_means = grid.cv_results_['mean_train_score']
train_stds = grid.cv_results_['std_train_score']

# Lay the flat result arrays out as one row per C value and one column per
# penalty, so that column i holds the curve for penaltys[i].
n_Cs = len(Cs)
number_penaltys = len(penaltys)
test_scores = np.array(test_means).reshape(n_Cs, number_penaltys)
train_scores = np.array(train_means).reshape(n_Cs, number_penaltys)
test_stds = np.array(test_stds).reshape(n_Cs, number_penaltys)
train_stds = np.array(train_stds).reshape(n_Cs, number_penaltys)

# C spans several orders of magnitude, so it is drawn on a log10 axis.
x_axis = np.log10(Cs)
for i, value in enumerate(penaltys):
    pyplot.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i],
                    label=value + ' Test')
    pyplot.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i],
                    label=value + ' Train')

pyplot.legend()
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.savefig('LogisticGridSearchCV_C.png')

pyplot.show()
<li class="tool-item tool-active is-like "><a href="javascript:;"><svg class="icon" aria-hidden="true"> <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#csdnc-thumbsup"></use> </svg><span class="name">点赞</span> <span class="count">1</span> </a></li> <li class="tool-item tool-active is-collection "><a href="javascript:;" data-report-click="{"mod":"popu_824"}"><svg class="icon" aria-hidden="true"> <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#icon-csdnc-Collection-G"></use> </svg><span class="name">收藏</span></a></li> <li class="tool-item tool-active is-share"><a href="javascript:;" data-report-click="{"mod":"1582594662_002"}"><svg class="icon" aria-hidden="true"> <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#icon-csdnc-fenxiang"></use> </svg>分享</a></li> <!--打赏开始--> <!--打赏结束--> <li class="tool-item tool-more"> <a> <svg t="1575545411852" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="5717" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><defs><style type="text/css"></style></defs><path d="M179.176 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5718"></path><path d="M509.684 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5719"></path><path d="M846.175 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z" p-id="5720"></path></svg> </a> <ul class="more-box"> <li class="item"><a class="article-report">文章举报</a></li> </ul> </li> </ul> </div> </div> <div class="person-messagebox"> <div class="left-message"><a href="https://blog.csdn.net/evolution23"> <img src="https://profile.csdnimg.cn/2/3/3/3_evolution23" class="avatar_pic" username="evolution23"> <img src="https://g.csdnimg.cn/static/user-reg-year/1x/9.png" class="user-years"> </a></div> <div class="middle-message"> <div class="title"><span class="tit"><a href="https://blog.csdn.net/evolution23" 
data-report-click="{"mod":"popu_379"}" target="_blank">二月鳥</a></span> </div> <div class="text"><span>发布了19 篇原创文章</span> · <span>获赞 7</span> · <span>访问量 2万+</span></div> </div> <div class="right-message"> <a href="https://im.csdn.net/im/main.html?userName=evolution23" target="_blank" class="btn btn-sm btn-red-hollow bt-button personal-letter">私信 </a> <a class="btn btn-sm bt-button personal-watch" data-report-click="{"mod":"popu_379"}">关注</a> </div> </div> </div>