import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
# Load the bank-marketing train/test CSVs and stack them so that all feature
# engineering below is applied to both frames at once.  Test rows have no 'y'
# column value (NaN), which is how they are split back out later
# (data['y'].isnull()).
train=pd.read_csv('F:/MyGit/competition/BankSale/train_set.csv')
test =pd.read_csv('F:/MyGit/competition/BankSale/test_set.csv')
data=pd.concat([train,test])
data.head()  # notebook-style peek at the combined frame; no effect in a script
def feature_count(data, features):
    """Append a group-size count column for the given feature combination.

    The new column is named 'count_<f1>_<f2>_...' and holds, for each row,
    the number of rows sharing that row's values of ``features``.
    Returns the merged frame and the generated column name.
    """
    feature_name = 'count_' + '_'.join(features)
    counts = (
        data.groupby(features)
        .size()
        .reset_index()
        .rename(columns={0: feature_name})
    )
    merged = data.merge(counts, 'left', on=features)
    return merged, feature_name
# All modelling columns = everything except the row id and the target.
feature = [col for col in data.columns if col not in ('ID', 'y')]
sparse_feature = ['campaign', 'contact', 'default', 'education', 'housing',
                  'job', 'loan', 'marital', 'month', 'poutcome']
dense_feature = list(set(feature) - set(sparse_feature))

# Add (day, month, <categorical>) co-occurrence counts; 'month' itself is
# omitted from the loop since it is already part of the grouping key.
ll = []
for col in ['campaign', 'contact', 'default', 'education', 'housing',
            'job', 'loan', 'marital', 'poutcome']:
    data, new_name = feature_count(data, ['day', 'month', col])
    ll.append(new_name)
def get_new_columns(name, aggs):
    """Build flattened column names for a groupby-agg result.

    Parameters
    ----------
    name : str
        The groupby key, used as the prefix of every generated name.
    aggs : dict[str, list]
        Mapping of column -> list of aggregations (strings like 'mean', or
        callables).

    Returns
    -------
    list[str]
        One name per (column, aggregation) pair, in dict/list order:
        '<name>_<column>_<agg>'.
    """
    cols = []
    for key, funcs in aggs.items():
        for agg in funcs:
            # Callables (lambdas, builtins, numpy functions) have no stable
            # printable name, so label them 'other'.  The original check
            # str(type(agg)) == "<class 'function'>" only matched plain
            # Python functions and crashed with a TypeError on any other
            # callable when concatenating it into the name.
            suffix = 'other' if callable(agg) else agg
            cols.append(name + '_' + key + '_' + suffix)
    return cols
# For each sparse (categorical) feature, aggregate every OTHER column over
# its groups and merge the statistics back onto each row.
for base in tqdm(sparse_feature):
    spec = {col: ['count', 'nunique'] for col in sparse_feature if col != base}
    for col in dense_feature:
        spec[col] = ['mean', 'max', 'min', 'std']
    grouped = data.groupby(base).agg(spec).reset_index()
    # Flatten the MultiIndex columns into '<base>_<col>_<agg>' names.
    grouped.columns = [base] + get_new_columns(base, spec)
    data = pd.merge(data, grouped, on=base, how='left')
# One-hot encode every categorical column and drop the original.
# NOTE: prefix=col + '_' produces names with a double underscore
# (e.g. 'job__admin'), matching the original encoding exactly.
for col in ['campaign','contact','default','education','housing','job','loan','marital','month','poutcome']:
    dummies = pd.get_dummies(data[col], prefix=col + '_')
    data = pd.concat([data, dummies], axis=1)
    data.drop(col, axis=1, inplace=True)
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Imputer
# Split the concatenated frame back into labelled (train) and unlabelled
# (test) rows: test rows carry NaN in 'y'.
df_train = data[data['y'].notnull()]
df_test = data[data['y'].isnull()]
target = df_train['y']

# Feature columns = everything except the row id and the target.
df_train_columns = df_train.columns.tolist()
df_train_columns.remove('ID')
df_train_columns.remove('y')

train_data = df_train[df_train_columns]
train_target = target

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, so the
# original Imputer(...).fit_transform call crashes on any modern install.
# Mean-impute with pandas instead — identical values to
# Imputer(strategy='mean') — and keep an ndarray like fit_transform returned.
# (Assumes all feature columns are numeric at this point, which the
# one-hot encoding above ensures.)
train_data = train_data.fillna(train_data.mean()).values

# 70/30 split used both for early stopping and for stacking evaluation below.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_data, train_target, test_size=0.3, random_state=666)
# Base learners for the stacking ensemble.
alg1 = RandomForestClassifier(n_estimators=181, max_depth=14, random_state=90, n_jobs=-1)
alg2 = GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)
alg3 = LogisticRegression(random_state=1, solver='lbfgs', max_iter=10000)
# alg4 = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#            decision_function_shape='ovr', degree=3, gamma='auto',
#            kernel='rbf', max_iter=-1, probability=False, random_state=None,
#            shrinking=True, tol=0.001, verbose=False)
alg5 = GaussianNB()

# LightGBM binary-classification parameters, evaluated by AUC.
param = {
    'num_leaves': 31,
    'min_data_in_leaf': 30,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.01,
    'min_child_samples': 20,
    'boosting': 'gbdt',
    'feature_fraction': 0.9,
    'bagging_freq': 1,
    'bagging_fraction': 0.9,
    'bagging_seed': 11,
    'metric': 'auc',
    'lambda_l1': 0.1,
    'verbosity': -1,
    'nthread': 4,
    'random_state': 666,
}
num_round = 10000
trn_data = lgb.Dataset(Xtrain, label=ytrain)
val_data = lgb.Dataset(Xtest, label=ytest)
###train
# Fit every base learner on the same 70% training split.
alg1.fit(Xtrain, ytrain)
alg2.fit(Xtrain, ytrain)
alg3.fit(Xtrain, ytrain)
#alg4.fit(Xtrain, ytrain)
alg5.fit(Xtrain, ytrain)
# NOTE(review): verbose_eval / early_stopping_rounds were removed from
# lgb.train in LightGBM 4.x; modern versions need the lgb.log_evaluation /
# lgb.early_stopping callbacks instead — confirm the pinned lightgbm version.
alg6 = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
###train
# Collect each base model's class predictions on the TRAINING rows as
# meta-features for the stacking model.
# NOTE(review): these are in-sample predictions (the base models were fit on
# these same rows), so the meta-learner sees over-fit inputs; standard
# stacking uses out-of-fold predictions here.
full_test_predictions=[]
full_test_predictions.append(alg1.predict(Xtrain.astype(float)))
full_test_predictions.append(alg2.predict(Xtrain.astype(float)))
full_test_predictions.append(alg3.predict(Xtrain.astype(float)))
#full_test_predictions.append(alg4.predict(Xtrain.astype(float)))
full_test_predictions.append(alg5.predict(Xtrain.astype(float)))
# LightGBM outputs probabilities; threshold at 0.5 to match the class labels
# produced by the sklearn models' predict().
alg6_predictions = alg6.predict(Xtrain, num_iteration=alg6.best_iteration)
alg6_predictions[alg6_predictions<=0.5]=0
alg6_predictions[alg6_predictions>0.5]=1
full_test_predictions.append( alg6_predictions)
#alg6_predictions = np.array(([alg6_predictions])).T
#full_test_predictions = np.concatenate((full_test_predictions,alg6_predictions),axis=1)
print(full_test_predictions)
# Transpose so rows = samples, columns = base models.
full_test_predictions = np.array(full_test_predictions).T
print (alg6_predictions)
print(len(alg6_predictions))
print(len(full_test_predictions))
# Meta-learner: Bernoulli naive Bayes over the binary base-model predictions.
ensemble = BernoulliNB()
ensemble.fit( full_test_predictions, ytrain)
ensemble.score(full_test_predictions,ytrain)
### test
# Rebuild the meta-features on the held-out 30% split and score the stacked
# model on it.
full_test_predictions = []
full_test_predictions.append(alg1.predict(Xtest.astype(float)))
full_test_predictions.append(alg2.predict(Xtest.astype(float)))
full_test_predictions.append(alg3.predict(Xtest.astype(float)))
#full_test_predictions.append(alg4.predict(Xtest.astype(float)))
full_test_predictions.append(alg5.predict(Xtest.astype(float)))
# Threshold LightGBM probabilities to class labels, as during training.
temp_predictions = alg6.predict(Xtest, num_iteration=alg6.best_iteration)
temp_predictions[temp_predictions<=0.5]=0
temp_predictions[temp_predictions>0.5]=1
full_test_predictions.append( temp_predictions)
print(full_test_predictions)
### test
full_test_predictions = np.array(full_test_predictions).T
# Hold-out accuracy of the stacked ensemble (displayed only in a notebook;
# the value is not stored).
ensemble.score(full_test_predictions,ytest)
# Build the submission file.  The original code referenced an undefined name
# `predictions` (NameError) and never ran the models on df_test at all;
# reconstruct the prediction pipeline on the real unlabelled rows here.
test_features = df_test[df_train_columns]
# Impute missing values with the TRAINING-set column means, mirroring the
# mean imputation applied to the training matrix.
test_matrix = test_features.fillna(df_train[df_train_columns].mean()).values

# Base-model meta-features, in the same order used when fitting `ensemble`.
base_predictions = [
    alg1.predict(test_matrix.astype(float)),
    alg2.predict(test_matrix.astype(float)),
    alg3.predict(test_matrix.astype(float)),
    alg5.predict(test_matrix.astype(float)),
]
# LightGBM outputs probabilities; threshold at 0.5 (0.5 itself maps to 0,
# matching the training-time thresholding).
lgb_pred = alg6.predict(test_matrix, num_iteration=alg6.best_iteration)
base_predictions.append((lgb_pred > 0.5).astype(float))

predictions = ensemble.predict(np.array(base_predictions).T)

sub = df_test[['ID']].copy()  # .copy() avoids the chained-assignment warning
sub['pred'] = predictions
sub.to_csv('/home/kesci/work/Result.csv', index=False)