# https://www.kaggle.com/happycube/bosch-production-line-performance/scirpus-extreme-bayes-faron-36312/code
# Note: this is an XGBoost training script worth studying, in particular its
# XGBoost parameter settings.
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from operator import itemgetter
# per raddar, all date features except for stations 24+25 are identical
def get_date_features(directory='../input/', trainfile='train_date.csv'):
    """Return the date-feature column names worth keeping.

    Per raddar's finding, all date columns within a station are identical
    except for stations 24 and 25, so we keep 'Id', every S24/S25 column,
    and only the first column encountered for every other station.

    Parameters
    ----------
    directory : str
        Folder containing the CSV (default keeps the original hard-coded
        path, so existing callers are unaffected).
    trainfile : str
        Name of the training date CSV whose header is inspected.

    Returns
    -------
    list of str
        The reduced list of column names.
    """
    # nrows=0 parses just the header row -- much cheaper than the old
    # chunksize=1 loop that read a full data row only to break immediately.
    features = list(pd.read_csv(directory + trainfile, nrows=0).columns)
    seen = np.zeros(52)  # one slot per station id (0..51)
    rv = []
    for f in features:
        # Always keep the Id column and every station-24/25 column.
        if f == 'Id' or 'S24' in f or 'S25' in f:
            rv.append(f)
            continue
        # Column names look like 'L3_S30_D3496'; extract the station number.
        station = int(f.split('_')[1][1:])
        if seen[station]:
            continue  # already kept one representative column for this station
        seen[station] = 1
        rv.append(f)
    return rv
# Resolve once, at import time, the reduced set of date columns loaded below.
usefuldatefeatures = get_date_features()
def get_mindate():
    """Build a frame with one row per part: its Id and the earliest date
    ('mindate') observed across the useful date columns, covering the
    train and test sets concatenated together (train rows first).

    Relies on the module-level ``usefuldatefeatures`` column list.
    """
    directory = '../input/'

    def _min_dates(filename, features):
        """Stream one CSV in chunks; return ((Id, mindate) frame, feature
        list discovered from the first chunk if not supplied)."""
        subset = None
        for chunk_no, chunk in enumerate(pd.read_csv(directory + filename,
                                                     usecols=usefuldatefeatures,
                                                     chunksize=50000,
                                                     low_memory=False)):
            print(chunk_no)
            if features is None:
                features = list(chunk.columns)
                features.remove('Id')
            part = chunk[['Id']].copy()
            # Row-wise minimum over the date columns = first timestamp at
            # which the part was seen anywhere on the production line.
            part['mindate'] = chunk[features].min(axis=1).values
            subset = part if subset is None else pd.concat([subset, part])
            del chunk
            gc.collect()
        return subset, features

    train_part, features = _min_dates('train_date.csv', None)
    # Test shares the same columns; reuse the feature list found on train.
    test_part, _ = _min_dates('test_date.csv', features)
    return pd.concat([train_part, test_part])
# Build the min-date features once at import time.
df_mindate = get_mindate()
# Order parts by when they first appeared on the line, ties broken by Id.
df_mindate.sort_values(by=['mindate', 'Id'], inplace=True)
# Id gap to the previous part in this time ordering (the "Faron magic"
# feature from the original kernel).
df_mindate['mindate_id_diff'] = df_mindate.Id.diff()
# Same gap looking forward: the negated diff shifted up one row; the last
# row has no successor and stays NaN.
midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan)
midr[0:-1] = -df_mindate.mindate_id_diff.values[1:]
df_mindate['mindate_id_diff_reverse'] = midr
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts.

    Returns 0 when the denominator would be zero (degenerate matrix),
    matching the usual convention.
    """
    numerator = tp * tn - fp * fn
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator == 0:
        return 0
    return numerator / np.sqrt(denominator)
def eval_mcc(y_true, y_prob, show=False):
    """Find the probability threshold that maximizes MCC.

    Walks the examples in ascending order of predicted probability,
    incrementally moving each one from the 'predicted positive' side of
    the confusion matrix to the 'predicted negative' side and tracking
    the best MCC seen.

    With show=False returns the best MCC; with show=True returns
    (best threshold probability, best MCC, thresholded 0/1 predictions).
    """
    order = np.argsort(y_prob)
    sorted_truth = y_true[order]
    n = y_true.shape[0]
    n_pos = 1.0 * np.sum(y_true)  # number of positives
    n_neg = n - n_pos             # number of negatives
    # Start with everything predicted positive.
    tp, tn, fp, fn = n_pos, 0.0, n_neg, 0.0
    best_mcc, best_id = 0.0, -1
    mccs = np.zeros(n)
    for i in range(n):
        # Example i crosses the threshold to the negative side.
        if sorted_truth[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        numer = tp * tn - fp * fn
        denom = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        current = 0 if denom == 0 else numer / np.sqrt(denom)
        mccs[i] = current
        # '>=' keeps the latest (highest-probability) threshold on ties.
        if current >= best_mcc:
            best_mcc = current
            best_id = i
    if not show:
        return best_mcc
    best_proba = y_prob[order[best_id]]
    y_pred = (y_prob > best_proba).astype(int)
    return best_proba, best_mcc, y_pred
def mcc_eval(y_prob, dtrain):
    """Custom XGBoost evaluation metric: the best achievable MCC over all
    probability thresholds (higher is better; used with maximize=True)."""
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)
def create_feature_map(features):
    """Write the XGBoost feature-map file 'xgb.fmap'.

    One line per feature in the form 'index<TAB>name<TAB>q' ('q' marks a
    quantitative feature), as expected by Booster.get_fscore(fmap=...).
    """
    # Context manager guarantees the file is closed even if a write fails
    # (the original leaked the handle on error).
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
def get_importance(gbm, features):
    """Return (feature, fscore) pairs for a trained booster, sorted by
    importance in descending order."""
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    return sorted(scores.items(), key=itemgetter(1), reverse=True)
def LeaveOneOut(data1, data2, columnName, useLOO=False):
    """Target-encode `columnName`: map each category in data2 to the mean
    Response of that category computed over data1.

    With useLOO=True, categories seen only once in data1 are dropped and
    each row's own outcome is subtracted back out (leave-one-out) to
    reduce leakage.
    NOTE(review): the LOO formula divides by x.shape[0]-1, where x spans
    all of data2 -- presumably the per-category count was intended;
    confirm against the original kernel before relying on useLOO=True.
    """
    # Per-category mean Response and occurrence count from the encoding set.
    grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index()
    grpCount = data1.groupby(columnName)['Response'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.Response
    if(useLOO):
        # Drop categories observed only once (their LOO mean is undefined).
        grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1]
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['Response'].values
    # Left-merge the category means onto data2; the suffixes make the
    # *right* (encoded) 'Response' keep the plain name, so selecting
    # ['Response'] below yields the encoding, not data2's own labels.
    x = pd.merge(data2[[columnName, 'Response']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['Response']
    if(useLOO):
        # Remove each row's own contribution from its category mean.
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        # x = x + np.random.normal(0, .01, x.shape[0])
    # Categories unseen in data1 fall back to the overall encoded mean.
    return x.fillna(x.mean())
def GrabData():
    """Load the hand-picked categorical/date/numeric columns for the train
    and test sets, attach the module-level min-date features, and
    target-encode the categorical and date columns using only the
    even-indexed half of the training rows.

    Returns
    -------
    (blindtraindata, testdata)
        The odd-indexed half of train (with encoded columns) and the
        encoded test set (its dummy Response column removed).
    """
    directory = '../input/'
    trainfiles = ['train_categorical.csv',
                  'train_date.csv',
                  'train_numeric.csv']
    testfiles = ['test_categorical.csv',
                 'test_date.csv',
                 'test_numeric.csv']
    # Hand-picked columns, one list per file (categorical, date, numeric).
    cols = [['Id',
             'L1_S24_F1559', 'L3_S32_F3851',
             'L1_S24_F1827', 'L1_S24_F1582',
             'L3_S32_F3854', 'L1_S24_F1510',
             'L1_S24_F1525'],
            ['Id',
             'L3_S30_D3496', 'L3_S30_D3506',
             'L3_S30_D3501', 'L3_S30_D3516',
             'L3_S30_D3511'],
            ['Id',
             'L1_S24_F1846', 'L3_S32_F3850',
             'L1_S24_F1695', 'L1_S24_F1632',
             'L3_S33_F3855', 'L1_S24_F1604',
             'L3_S29_F3407', 'L3_S33_F3865',
             'L3_S38_F3952', 'L1_S24_F1723',
             'Response']]

    def _load(filename, usecols):
        """Stream one CSV in chunks, concatenating the selected columns."""
        subset = None
        for chunk_no, chunk in enumerate(pd.read_csv(directory + filename,
                                                     usecols=usecols,
                                                     chunksize=50000,
                                                     low_memory=False)):
            print(chunk_no)
            subset = chunk.copy() if subset is None \
                else pd.concat([subset, chunk])
            del chunk
            gc.collect()
        return subset

    def _load_merged(filenames):
        """Load each file's column subset and merge them all on Id."""
        data = None
        for file_no, f in enumerate(filenames):
            print(f)
            subset = _load(f, cols[file_no])
            data = subset.copy() if data is None \
                else pd.merge(data, subset.copy(), on="Id")
            del subset
            gc.collect()
        return data

    traindata = _load_merged(trainfiles)
    del cols[2][-1]  # Test doesn't have response!
    testdata = _load_merged(testfiles)
    # Attach the global min-date features computed at module import time.
    traindata = traindata.merge(df_mindate, on='Id')
    testdata = testdata.merge(df_mindate, on='Id')
    testdata['Response'] = 0  # Dummy so LeaveOneOut can select the column.
    # Split train in half: even rows supply the encoding statistics,
    # odd rows become the actual training set (avoids self-leakage).
    visibletraindata = traindata[::2]
    blindtraindata = traindata[1::2]
    print(blindtraindata.columns)
    # Encode every categorical and date column (cols[0] and cols[1]).
    for i in range(2):
        for col in cols[i][1:]:
            print(col)
            blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata,
                                                     blindtraindata,
                                                     col, False).values
            testdata.loc[:, col] = LeaveOneOut(visibletraindata,
                                               testdata, col, False).values
    del visibletraindata
    gc.collect()
    testdata.drop('Response', inplace=True, axis=1)
    return blindtraindata, testdata
def Train():
    """Train XGBoost on the blind half of the training data, evaluate with
    the custom MCC metric, and write raw + thresholded submission CSVs."""
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    # All columns except the target and the row identifier are features.
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021            # small learning rate
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    # NOTE(review): presumably set near the dataset's positive-class rate
    # so early boosting rounds start from a sensible prior -- confirm.
    params['base_score'] = 0.005
    params['silent'] = True
    print('Fitting')
    trainpredictions = None
    testpredictions = None
    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)
    folds = 1
    for i in range(folds):
        print('Fold:', i)
        # Only the seed varies per fold; predictions are averaged below.
        params['seed'] = i
        # The watchlist evaluates on the training matrix itself (there is
        # no separate held-out set here), so early stopping tracks train MCC.
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        # Predict with trees up to the best iteration found by early stopping.
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)
        # Best threshold/MCC over the training predictions (optimistic --
        # measured on the data the model was fit on).
        best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response,
                                y_pred))
        # Accumulate per-fold predictions for averaging.
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)
    # Evaluate the fold-averaged training predictions.
    best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response,
                            y_pred))
    # Raw (probability) predictions for the training half, with labels.
    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)
    # Raw (probability) predictions for the test set.
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                          index=False)
    # Final binary submission using a hard-coded 0.08 probability threshold.
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                          index=False)
# Script entry point: run the full pipeline when executed directly.
if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
# Key takeaways from this script:
# 1. Defining a custom evaluation metric (best-threshold MCC).
# 2. Using early stopping to guard against overfitting.
# 3. Concise chunked generation of the train/test datasets.
# 4. Running the same model over n folds (different seeds) and averaging
#    the predictions.