Beginning
参考 Kaggle 上的方案后,模型选用 XGBoost,参数参考了 Sandro 的设置,并采用搜索引擎常用的 NDCG 指标来评估预测结果的排序质量。
引入使用的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import pickle
import datetime
import os
import seaborn as sns
from sklearn.preprocessing import *
%matplotlib inline
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
导入已经进行完特征工程的数据:
# Training features (feature engineering already applied); the first CSV column becomes the index.
xtrain = pd.read_csv("Finish_train.csv",index_col=0)
# Training labels; the file is read without a header row.
ytrain = pd.read_csv("Airbnb_ytrain_v2.csv", header=None)
对标签(类别型变量)使用 LabelEncoder 进行编码,转换为数字:
# Show the distinct label values (rendered as the cell output in a notebook).
np.unique(ytrain.values)
from sklearn.preprocessing import LabelEncoder

# Map the label values to consecutive integers.
le = LabelEncoder()
# LabelEncoder expects a 1-D array. ytrain.values is 2-D (presumably a single
# column, shape (n, 1)), which scikit-learn only accepts after emitting a
# DataConversionWarning. squeeze(axis=1) drops that axis explicitly and still
# raises ValueError if the file unexpectedly holds more than one column.
ytrain_le = le.fit_transform(np.squeeze(ytrain.values, axis=1))
划分数据集
这里标注一下:我的电脑(CPU i7-7700K,内存 16 GB)最多只能跑 30% 的数据,数据再多就会报错 MemoryError,暂时还没有找到解决方案,可能是模型算法堆叠层次比较高:
# Keep only the leading 30% of the rows (features and encoded labels alike).
n = int(0.3 * xtrain.shape[0])
print(n)

xtrain_new = xtrain.iloc[:n]
ytrain_new = ytrain_le[:n]

# Report the shapes of the reduced feature table and label vector.
print(xtrain_new.shape)
print(ytrain_new.shape)
针对上面的内存问题:把 float64 转换为 float32,仍然不行:
# Downcast every feature column to 32-bit floats (an attempt to cut memory use).
xtrain_new = xtrain_new.astype(np.float32)
StandardScaling the dataset(把特征数据标准化)
# Standardise the features with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the K-fold split
# further down, so validation rows contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(xtrain_new)
xtrain_new = X_scaler.transform(xtrain_new)
print(xtrain_new)
Airbnb NDCG
这里参考Kaggle上的一个NDCG的包:
# From Kaggle Kernels
from sklearn.metrics import make_scorer
def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain of a single ranking, truncated at rank k.

    Parameters
    ----------
    y_true : array, shape = [n_items]
        Relevance of each item (for one sample: a one-hot row over classes).
    y_score : array, shape = [n_items]
        Predicted score of each item; higher scores are ranked first.
    k : int
        Number of top-ranked items that contribute to the result.

    Returns
    -------
    float
        Sum over the top-k items of ``(2 ** relevance - 1) / log2(rank + 1)``,
        with ranks counted from 1.
    """
    # Item indices sorted by descending score, cut to the first k.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    # Exponential gain: an item with relevance 0 contributes nothing.
    gain = 2 ** relevance - 1
    # Rank r (1-based) is discounted by log2(r + 1).
    positions = np.arange(len(relevance))
    return np.sum(gain / np.log2(positions + 2))
# A relevance of 0 contributes no gain: the "- 1" makes 2 ** 0 - 1 == 0.
def ndcg_score(ground_truth, predictions, k=5):
    """Mean NDCG@k for single-label multi-class predictions.

    Every sample has exactly one relevant class (its true label).  The label
    is one-hot encoded over the columns of ``predictions`` and the sample's
    DCG is divided by the DCG of the ideal ranking, using ``dcg_score``.

    The one-hot matrix has ``n_classes`` columns.  The previous version
    encoded over ``n_samples + 1`` classes, which built a dense
    ``n_samples x (n_samples + 1)`` matrix and exhausted memory on large
    inputs.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        True labels, encoded as integers in ``range(n_classes)``.
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted scores or probabilities; column ``j`` belongs to class ``j``.
    k : int
        Rank cut-off.

    Returns
    -------
    float
        Mean of the per-sample NDCG@k values.  A sample whose ideal DCG is 0
        (its label matches no column, or ``k`` is 0) scores 0.

    Raises
    ------
    ValueError
        If ``predictions`` is not 2-D, or its number of rows differs from the
        number of labels.

    Example
    -------
    ground_truth = [1, 0, 2]
    predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    ndcg_score(ground_truth, predictions, k=2)   # -> 1.0
    predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    ndcg_score(ground_truth, predictions, k=2)   # -> 0.666...
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(ground_truth).ravel()
    if predictions.ndim != 2:
        raise ValueError("predictions must have shape [n_samples, n_classes]")
    if labels.shape[0] != predictions.shape[0]:
        raise ValueError("ground_truth and predictions differ in length")

    # One-hot encode over the classes only, e.g. with three classes
    # [1, 0, 2] -> [[0, 1, 0], [1, 0, 0], [0, 0, 1]].
    n_classes = predictions.shape[1]
    one_hot = (labels[:, None] == np.arange(n_classes)).astype(int)

    scores = []
    # Score each sample: actual DCG relative to the best achievable DCG.
    for y_true, y_score in zip(one_hot, predictions):
        best = float(dcg_score(y_true, y_true, k))
        if best == 0.0:
            # Nothing relevant can be ranked; avoid dividing by zero.
            scores.append(0.0)
            continue
        actual = float(dcg_score(y_true, y_score, k))
        scores.append(actual / best)
    return np.mean(scores)
# From Kaggle Kernels
from sklearn.metrics import make_scorer
def dcg_score(y_true, y_score, k=5):
    """Return the DCG at rank k of the items in y_true ordered by y_score.

    Redefines the function of the same name that appears earlier in this file.

    y_true holds one relevance value per item and y_score the matching
    predicted scores.  Items are ranked by descending score, the first k are
    kept, and each contributes ``(2 ** relevance - 1) / log2(rank + 1)`` with
    ranks counted from 1.
    """
    best_first = np.argsort(y_score)[::-1]
    kept = np.take(y_true, best_first[:k])
    # Denominators log2(2), log2(3), ... for ranks 1, 2, ...
    discounts = np.log2(np.arange(2, len(kept) + 2))
    gain = 2 ** kept - 1
    return np.sum(gain / discounts)
def ndcg_score(ground_truth, predictions, k=5):
    """Mean NDCG@k for single-label multi-class predictions.

    Redefines the function of the same name that appears earlier in this file.

    Each true label is one-hot encoded over the ``n_classes`` columns of
    ``predictions``; the per-sample score is the DCG of the predicted ranking
    divided by the DCG of the ideal ranking (both from ``dcg_score``).
    Encoding over the classes, rather than over ``n_samples + 1`` values as
    before, keeps the one-hot matrix at ``n_samples x n_classes`` instead of a
    dense ``n_samples x (n_samples + 1)`` array that exhausts memory.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        True labels, encoded as integers in ``range(n_classes)``.
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted scores or probabilities; column ``j`` belongs to class ``j``.
    k : int
        Rank cut-off.

    Returns
    -------
    float
        Mean of the per-sample NDCG@k values.  A sample whose ideal DCG is 0
        (its label matches no column, or ``k`` is 0) scores 0.

    Raises
    ------
    ValueError
        If ``predictions`` is not 2-D, or its number of rows differs from the
        number of labels.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(ground_truth).ravel()
    if predictions.ndim != 2:
        raise ValueError("predictions must have shape [n_samples, n_classes]")
    if labels.shape[0] != predictions.shape[0]:
        raise ValueError("ground_truth and predictions differ in length")

    # Row i is all zeros except for a 1 in the column of sample i's label.
    n_classes = predictions.shape[1]
    one_hot = (labels[:, None] == np.arange(n_classes)).astype(int)

    scores = []
    for y_true, y_score in zip(one_hot, predictions):
        best = float(dcg_score(y_true, y_true, k))
        if best == 0.0:
            # Nothing relevant can be ranked; avoid dividing by zero.
            scores.append(0.0)
            continue
        actual = float(dcg_score(y_true, y_score, k))
        scores.append(actual / best)
    return np.mean(scores)
XGBoost 模型
import xgboost as xgb
def customized_eval(preds, dtrain):
    """Custom XGBoost evaluation metric, reported under the name 'ndcg5'.

    For every row of ``preds`` the five highest-scoring column indices are
    ranked from best to worst.  A row scores ``1 / log2(rank + 1)`` (rank
    counted from 1) when its label equals one of those indices, and 0
    otherwise.  The metric is the mean over all rows.

    Parameters
    ----------
    preds : array
        Per-row scores, read row by row.
    dtrain : object
        Provides ``get_label()`` returning one label per row of ``preds``.

    Returns
    -------
    tuple
        ``('ndcg5', score)``.
    """
    truth = np.asarray(dtrain.get_label()).ravel()

    # Five best column indices per row, best first.
    top = np.array([np.argsort(row)[::-1][:5] for row in preds])
    n_ranked = top.shape[1]

    # hits[i, j] is 1 when the j-th ranked index of row i is the true label.
    hits = (truth[:, None] == top).astype(int)
    discounts = np.log2(np.arange(2, n_ranked + 2))
    per_row = np.sum(hits / discounts, axis=1)
    score = np.mean(per_row)
    return 'ndcg5', score
# General settings. Of these, only RANDOM_STATE is read by the parameter
# dictionary below.
LEARNING_RATE = 0.1
N_ESTIMATORS = 50
RANDOM_STATE = 2017
MAX_DEPTH = 9

# Number of boosting rounds for XGBoost.
NUM_XGB = 200

# XGBoost booster parameters.
params = {
    'colsample_bytree': 0.6,
    'max_depth': 6,
    'subsample': 0.8,
    'eta': 0.3,
    'seed': RANDOM_STATE,
    'num_class': 12,
    # Output per-class probabilities instead of the predicted class.
    'objective': 'multi:softprob',
}
print(params)
# Per-fold NDCG@k on the training part and on the held-out part.
train_score_iter = []
cv_score_iter = []

# shuffle=True is what makes random_state take effect; recent scikit-learn
# releases raise ValueError when a seed is given without shuffling.
kf = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Rank cut-off used for the NDCG evaluation.
k_ndcg = 5

for train_index, test_index in kf.split(xtrain_new, ytrain_new):
    X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
    y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]
    print(X_train.shape, X_test.shape)

    train_xgb = xgb.DMatrix(X_train, label=y_train)
    test_xgb = xgb.DMatrix(X_test, label=y_test)

    # Early stopping watches the last entry of the evaluation list ('test').
    watchlist = [(train_xgb, 'train'), (test_xgb, 'test')]
    bst = xgb.train(
        params,
        train_xgb,
        # Keyword arguments: newer XGBoost releases no longer accept these
        # positionally.
        num_boost_round=NUM_XGB,
        evals=watchlist,
        feval=customized_eval,
        # NDCG is a higher-is-better metric; without this flag recent XGBoost
        # versions minimise a custom metric during early stopping.
        maximize=True,
        verbose_eval=3,
        early_stopping_rounds=5,
    )

    # NOTE(review): whether predict() uses the best early-stopping iteration
    # or the full model depends on the XGBoost version - confirm.
    y_pred = np.array(bst.predict(test_xgb))
    y_pred_train = np.array(bst.predict(train_xgb))

    # Binary classification would typically use F1, precision, recall or AUC;
    # for the Airbnb task the predictions are scored with NDCG instead.
    train_ndcg_score = ndcg_score(y_train, y_pred_train, k=k_ndcg)
    cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

    train_score_iter.append(train_ndcg_score)
    cv_score_iter.append(cv_ndcg_score)

# Average the per-fold scores.
train_score_xgb = np.mean(train_score_iter)
cv_score_xgb = np.mean(cv_score_iter)

print("\nThe training score is: {}".format(train_score_xgb))
print("The cv score is: {}\n".format(cv_score_xgb))