1、准备
# 首先 import 必要的模块
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
#竞赛的评价指标为logloss
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Load the Otto training set and take a first look at its contents.
data = pd.read_csv('Otto_train.csv')
data.head()
data.info()
data.describe()
data.shape
# Keep only the first 20,000 rows because of limited machine resources.
# NOTE(review): a head slice is only representative if the file is not
# ordered by target -- confirm against the count plot below.
data = data[:20000]
# Target distribution: check whether the classes are balanced.
# The column is passed via the `x` keyword because recent seaborn releases
# accept only `data` as a positional argument.
sns.countplot(x=data.target)
pyplot.xlabel('target');
pyplot.ylabel('Number of occurrences');
2、数据的标准化
# Turn the class strings into zero-based integer labels: drop the first six
# characters (presumably the "Class_" prefix -- confirm against the data),
# parse the remainder as an int and shift it to start at 0.
y_train = data.target
y_train = y_train.map(lambda s: int(s[6:]) - 1)
# Everything except the label and the row id is used as a feature.
data = data.drop(['target', 'id'], axis=1)
X_train = np.array(data)

# Feature standardization.
from sklearn.preprocessing import StandardScaler

# Create the scaler for the features.
ss_X = StandardScaler()
# Only training data exists at this point, so the scaler is fitted on it and
# applied to it in one step.
X_train = ss_X.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression
# cross_val_score lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()
# Cross-validation is used to estimate model performance and to tune
# hyper-parameters (model selection).
# For classification tasks an integer `cv` defaults to StratifiedKFold.
loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')
# The scorer returns the negated logloss, so flip the sign before reporting.
print('logloss of each fold is: ', -loss)
print('cv logloss is:', -loss.mean())
3、用LogisticRegressionCV的L1正则
from sklearn.linear_model import LogisticRegressionCV

# Candidate values of C to cross-validate.
Cs = [1, 10, 100, 1000]
# L1-penalized logistic regression with built-in cross-validation over Cs.
# The original note suggests the saga solver (new in scikit-learn 0.19) for
# large, high-dimensional data (60k+ samples, 93 features) with an L1 penalty;
# this cell nevertheless uses liblinear with one-vs-rest.
# The original author also notes that LogisticRegressionCV is faster than
# GridSearchCV.
lrcv_L1 = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    scoring='neg_log_loss',
    penalty='l1',
    solver='liblinear',
    multi_class='ovr',
)
lrcv_L1.fit(X_train, y_train)
# Notebook output of the fit call (estimator repr). It is kept as a comment so
# that it is not executed as code, which would build and discard a second
# estimator:
# LogisticRegressionCV(Cs=[1, 10, 100, 1000], class_weight=None, cv=5,
#            dual=False, fit_intercept=True, intercept_scaling=1.0,
#            max_iter=100, multi_class='ovr', n_jobs=1, penalty='l1',
#            random_state=None, refit=True, scoring='neg_log_loss',
#            solver='liblinear', tol=0.0001, verbose=0)
lrcv_L1.scores_
# scores_ is a dict with classes as keys; each value is the grid of scores
# obtained while cross-validating each fold, with shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# Use however many per-class entries scores_ actually holds instead of
# assuming exactly three classes.
n_classes = len(lrcv_L1.scores_)
scores = np.zeros((n_classes, n_Cs))
for j, fold_scores in enumerate(lrcv_L1.scores_.values()):
    # Average over the folds for this class.
    scores[j] = np.mean(fold_scores, axis=0)
# Average over classes. The scorer is neg_log_loss, so negating yields the
# (positive) mean logloss per C. The name `mse_mean` is kept for
# compatibility even though the quantity is a logloss, not an MSE.
mse_mean = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mse_mean)
pyplot.xlabel('log(C)')
# The curve shows the sign-flipped score, i.e. the logloss itself.
pyplot.ylabel('logloss')
pyplot.show()
# For multi-class problems each per-class classifier has its own selected C,
# available as lrcv_L1.C_.
lrcv_L1.coef_
4、用LogisticRegressionCV的L2正则
from sklearn.linear_model import LogisticRegressionCV

# Candidate values of C to cross-validate.
Cs = [1, 10, 100, 1000]
# L2-penalized logistic regression with built-in cross-validation over Cs.
# Per the original note: for large, high-dimensional data (60k+ samples,
# 93 features) with an L2 penalty the default solver is lbfgs; liblinear is
# used here as well so the result can be compared with GridSearchCV.
lr_cv_L2 = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    scoring='neg_log_loss',
    penalty='l2',
    solver='liblinear',
    multi_class='ovr',
)
lr_cv_L2.fit(X_train, y_train)
lr_cv_L2.scores_
# scores_ is a dict with classes as keys; each value is the grid of scores
# obtained while cross-validating each fold, with shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# Use however many per-class entries scores_ actually holds instead of
# assuming exactly three classes.
n_classes = len(lr_cv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))
for j, fold_scores in enumerate(lr_cv_L2.scores_.values()):
    # Average over the folds for this class.
    scores[j] = np.mean(fold_scores, axis=0)
# Average over classes. The scorer is neg_log_loss, so negating yields the
# (positive) mean logloss per C. The name `mse_mean` is kept for
# compatibility even though the quantity is a logloss, not an MSE.
mse_mean = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mse_mean)
pyplot.xlabel('log(C)')
# The curve shows the sign-flipped score, i.e. the logloss itself.
pyplot.ylabel('logloss')
pyplot.show()
# For multi-class problems each per-class classifier has its own selected C,
# available as lr_cv_L2.C_.
from sklearn.linear_model import LogisticRegressionCV

# Candidate values of C to cross-validate.
Cs = [1, 10, 100, 1000]
# L2-penalized logistic regression with built-in cross-validation over Cs.
# No solver is passed, so the library default is used (lbfgs per the original
# note, which recommends it for large, high-dimensional data: 60k+ samples,
# 93 features).
# The original author also notes that LogisticRegressionCV is faster than
# GridSearchCV.
lrcv_L2 = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    scoring='neg_log_loss',
    penalty='l2',
    multi_class='ovr',
)
lrcv_L2.fit(X_train, y_train)
lrcv_L2.scores_
# scores_ is a dict with classes as keys; each value is the grid of scores
# obtained while cross-validating each fold, with shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# Use however many per-class entries scores_ actually holds instead of
# assuming exactly three classes.
n_classes = len(lrcv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))
for j, fold_scores in enumerate(lrcv_L2.scores_.values()):
    # Average over the folds for this class.
    scores[j] = np.mean(fold_scores, axis=0)
# Average over classes. The scorer is neg_log_loss, so negating yields the
# (positive) mean logloss per C. The name `mse_mean` is kept for
# compatibility even though the quantity is a logloss, not an MSE.
mse_mean = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mse_mean)
pyplot.xlabel('log(C)')
# The curve shows the sign-flipped score, i.e. the logloss itself.
pyplot.ylabel('logloss')
pyplot.show()