1. A simple single-layer stacking
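Stacking trains several base models, collects their out-of-fold predictions as new features, and fits a meta-model (the "stacker") on those features. Before the full script, here is a minimal self-contained sketch of the same idea using scikit-learn's built-in StackingClassifier (available in scikit-learn 0.22+; synthetic data and base estimators chosen purely for illustration), as an alternative to the hand-rolled Ensemble class below:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_toy, y_toy = make_classification(n_samples=500, random_state=0)
stack_demo = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=50, random_state=0)),
                ('svc', SVC(probability=True, random_state=0))],
    final_estimator=LogisticRegression(),
    cv=5)  # 5-fold out-of-fold predictions feed the final LogisticRegression
stack_demo.fit(X_toy, y_toy)
print(stack_demo.score(X_toy, y_toy))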
import csv
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import hstack, vstack, array, nan
from pandas import Series, DataFrame
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, ExtraTreesRegressor,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
# Load the data set and split training / test data
%matplotlib inline
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
# Data preprocessing
train_df = pd.read_csv(PATH_TRAIN, index_col=0)
test_df = pd.read_csv(PATH_TEST, index_col=0)
y_train = train_df.pop('fake')
all_df = pd.concat((train_df, test_df), axis=0)
all_df.isnull().sum().sort_values(ascending=False).head()  # inspect missing values
mean_cols = all_df.mean()
mean_cols.head(10)
all_df = all_df.fillna(mean_cols)
all_df.isnull().sum().sum()
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]
train_df_xy = pd.concat((dummy_train_df,y_train), axis=1)
train2_df = train_df_xy.iloc[:2550,:]
train3_df = train_df_xy.iloc[3825:,:]
train1_df = train_df_xy.iloc[2550:3825,:]
train_df_xy = pd.concat((train2_df,train3_df), axis=0)
train1_label = train1_df.pop('fake')
#add_df = train_df_xy[train_df_xy['fake'].isin([1])]
#train_df_xy = pd.concat((add_df,train_df_xy,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df), axis=0)
#train_df_xy = train_df_xy.sample(frac = 1)
#y_train = train_df_xy.pop('fake')
#dummy_train_df = train_df_xy
def get_id(path):
    # Read the first column (sample IDs) of a CSV file into a numpy array
    id_list = []
    csv_reader = csv.reader(open(path))
    for row in csv_reader:
        id_list.append(row[0])
    id_array = np.array(id_list)
    return id_array
dummy_train_df = train_df_xy
y_train = train_df_xy.pop('fake')
#oversampler = SMOTE(ratio={1:7000}, random_state=np.random.randint(100), k_neighbors=5, m_neighbors=10, kind='regular', n_jobs=-1)
#os_X_train, os_y_train = oversampler.fit_sample(dummy_train_df,y_train)
#dummy_train_df = os_X_train
#y_train = os_y_train
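# Note: newer versions of imbalanced-learn replaced ratio/fit_sample with
# sampling_strategy/fit_resample. If the oversampling step above is enabled,
# an equivalent call (left commented out, like the original) might look like:
#oversampler = SMOTE(sampling_strategy={1: 7000}, k_neighbors=5, random_state=42)
#dummy_train_df, y_train = oversampler.fit_resample(dummy_train_df, y_train)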
x1_test = np.array(train1_df,dtype = float)
y1_test = np.array(train1_label,dtype = int)
y_train = np.array(y_train,dtype = int)
x_train = np.array(dummy_train_df,dtype = float)
x_test = np.array(dummy_test_df,dtype = float)
id_test = get_id(PATH_TEST)
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y, dtype=int)
        T = np.array(T)
        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
        # Out-of-fold predictions of each base model become the stacker's input features
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print("Fit Model %d fold %d" % (i, j))
                clf.fit(X_train, y_train)
                y_pred = np.array(clf.predict(X_holdout)[:], dtype=int)
                # Debug: show positive predictions (use a fresh loop name; reusing
                # `i` here would clobber the model index and corrupt S_train/S_test)
                for p in y_pred:
                    if int(p) == 1:
                        print(p)
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            # Average the per-fold test-set predictions of model i
            S_test[:, i] = S_test_i.mean(axis=1)
        results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)[:]
        # Debug: show positive final predictions
        for r in res:
            if int(r) == 1:
                print(r)
        return res
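# For reference, the out-of-fold matrix that fit_predict builds by hand can also
# be produced with sklearn's cross_val_predict. A small self-contained sketch on
# synthetic data (the two base models here are chosen only for illustration):
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
demo_models = (LogisticRegression(), RandomForestClassifier(n_estimators=30))
S_demo = np.column_stack(
    [cross_val_predict(m, X_demo, y_demo, cv=3) for m in demo_models])
print(S_demo.shape)  # (300, 2): one out-of-fold prediction column per base model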
# rf params
rf_params = {}
rf_params['n_estimators'] = 50
rf_params['max_depth'] = 8
rf_params['min_samples_split'] = 100
rf_params['min_samples_leaf'] = 30
# xgb params
xgb_params = {}
xgb_params['n_estimators'] = 50
xgb_params['min_child_weight'] = 12
xgb_params['learning_rate'] = 0.27
xgb_params['max_depth'] = 6
xgb_params['subsample'] = 0.77
xgb_params['reg_lambda'] = 0.8
xgb_params['reg_alpha'] = 0.4
xgb_params['base_score'] = 0
xgb_params['silent'] = 1
rf_model = RandomForestClassifier(**rf_params)
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()
svc_model = SVC()
xgb_model = XGBRegressor(**xgb_params)
'''
# XGB model
xgb_model = XGBRegressor(**xgb_params)
#xgb_model = BaggingClassifier(base_estimator=xgb_model_, n_estimators=20,max_samples=0.8, max_features=0.8,
#bootstrap=True,bootstrap_features=False, n_jobs=1, random_state=1)
# RF model
rf_model_ = RandomForestClassifier(**rf_params)
rf_model = BaggingClassifier(base_estimator=rf_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
# ET model
et_model_ = SVC()
et_model = BaggingClassifier(base_estimator=et_model_, n_estimators=20,max_samples=0.9, max_features=0.8)
# SVR model
# SVM is too slow on more than 10,000 samples
#svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)
# DecsionTree model
dt_model_ = GradientBoostingClassifier()
dt_model = BaggingClassifier(base_estimator=dt_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
# AdaBoost model
ada_model_ = AdaBoostClassifier()
ada_model = BaggingClassifier(base_estimator=ada_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
svc = SVC()
clf = BaggingClassifier(base_estimator=ada_model_, n_estimators=20,max_samples=0.8, max_features=0.8,
bootstrap=True,bootstrap_features=False, n_jobs=1, random_state=1)'''
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
stack = Ensemble(n_splits=3,
stacker=clf,
base_models=(rf_model,adb_model, gdbc_model, et_model, svc_model,xgb_model))
y_test = stack.fit_predict(x_train, y_train, x_test)
y_pre = stack.fit_predict(x_train, y_train, x1_test)
print(y_pre)
f1score_val = f1_score(y1_test, y_pre, average='binary')
print('f1 score val:',f1score_val)
csvFile2 = open('stacking.csv', 'w', newline='')  # newline='' avoids blank lines between rows
writer = csv.writer(csvFile2)
writer.writerow(['stockcode', 'fake'])
m = len(y_test)
print('Generating file...')
for i in range(m):
    writer.writerow([int(id_test[i + 1]), int(y_test[i])])
csvFile2.close()
2. Open-source code shared by a Kaggle expert
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from numpy import hstack,vstack,array,nan
from sklearn.metrics import precision_score, recall_score, f1_score
import csv
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
warnings.filterwarnings('ignore')
# Normalize the training and test data together
train_df = pd.read_csv(PATH_TRAIN,index_col=0)
test_df = pd.read_csv(PATH_TEST,index_col=0)
fake_df = train_df.pop('fake')
# Fill missing values with column means
data = pd.concat((train_df,test_df), axis=0)
mean_cols = data.mean()
#print(mean_cols.head)
data = data.fillna(mean_cols)
numeric_cols = data.columns[data.dtypes != 'object']
numeric_col_means = data.loc[:, numeric_cols].mean()
numeric_col_std = data.loc[:, numeric_cols].std()
data.loc[:, numeric_cols] = (data.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
#data=data[~data['A_to_L_ratio'].isin([175086.8088])]
# sklearn's preprocessing module can also handle this step
from sklearn.preprocessing import StandardScaler
# (a reshape with -1 lets numpy infer the number of rows automatically)
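# For reference, the manual z-scoring above is what StandardScaler computes
# (up to ddof: pandas .std() uses ddof=1, StandardScaler uses ddof=0).
# A sketch kept in a separate variable so `data` is not rescaled a second time:
data_scaled_alt = data.copy()
data_scaled_alt[numeric_cols] = StandardScaler().fit_transform(data[numeric_cols])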
dummy_train_df = data.loc[train_df.index]
dummy_test_df = data.loc[test_df.index]
#data.describe().T  # some columns are still not fully normalized
data = pd.concat((dummy_train_df,fake_df), axis=1)
# Count how many samples belong to class 0 and to class 1
count_classes = pd.value_counts(data['fake'], sort = True).sort_index()
count_classes.plot(kind = 'bar')
plt.title("fake class histogram")
plt.xlabel("fake")
plt.ylabel("Frequency")
# Separate the features and the label
X = data.loc[:, data.columns != 'fake']
y = data.loc[:, data.columns == 'fake']
# Count and indices of the fake (fake == 1) samples, used for the resampling step
number_records_fake = len(data[data.fake == 1])
fake_indices = np.array(data[data.fake == 1].index)
# Indices of all normal samples; for undersampling we randomly draw from these
normal_indices = data[data.fake == 0].index
# Randomly sample as many normal records as there are fake ones
random_normal_indices = np.random.choice(normal_indices, number_records_fake, replace = False)
# Convert to a numpy array
random_normal_indices = np.array(random_normal_indices)
# Combine the fake indices and the sampled normal indices
under_sample_indices = np.concatenate([fake_indices,random_normal_indices])
# Select the undersampled rows by index
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'fake']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'fake']
# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.fake == 0])/len(under_sample_data))
print("Percentage of fake transactions: ", len(under_sample_data[under_sample_data.fake == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
from sklearn.model_selection import GridSearchCV, train_test_split
# Split the original data (random_state fixes the shuffle); the held-out part can later validate the model trained on the undersampled set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Split the undersampled data set
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
#Recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report
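# Quick sanity check of the metrics used below on toy labels (illustrative only):
# with y_true = [1, 1, 0, 0, 1] and y_pred = [1, 0, 0, 0, 1], TP = 2, FN = 1, FP = 0,
# so recall = 2/3 and f1 = 2 * precision * recall / (precision + recall) = 0.8.
print(recall_score([1, 1, 0, 0, 1], [1, 0, 0, 0, 1]))  # 0.666...
print(f1_score([1, 1, 0, 0, 1], [1, 0, 0, 0, 1]))      # 0.8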
def get_id(path):
    id_list = []
    csv_reader = csv.reader(open(path))
    for row in csv_reader:
        id_list.append(row[0])
    id_array = np.array(id_list)
    return id_array
id_test = get_id(PATH_TEST)
#print('#########################################',id_test)
x_test_real = np.array(dummy_test_df,dtype = float)
def printing_Kfold_scores(x_train_data, y_train_data):
    # 5-fold cross-validation
    fold = KFold(5, shuffle=False)
    # Try a few candidate values since the best C is not known in advance
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    # each k-fold split gives two index arrays: train_indices = indices[0], test_indices = indices[1]
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')
        recall_accs = []
        auc_accs = []
        f1_accs = []
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            # penalty selects the regularization type; solver='liblinear' supports penalty='l1'
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit on the training part of the fold (indices[0]) and
            # predict on the held-out part (indices[1])
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
            # Get predictions on the held-out fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
            #for i in y_pred_undersample:
            #    if i == 1:
            #        print(i)
            # Recall and f1 of this fold for the current C
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            #auc_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            f1_acc = f1_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            f1_accs.append(f1_acc)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc, 'f1 score = ', f1_acc)
        # Save the mean fold scores for this C
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        results_table.loc[j, 'Mean f1 score'] = np.mean(f1_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('Mean f1 score ', np.mean(f1_accs))
        print('')
    #best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
    # Finally, check which C parameter performed best among the candidates.
    print('*********************************************************************************')
    print(results_table)
    #print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    #return best_c
printing_Kfold_scores(X_train_undersample,y_train_undersample)
def test(x_train_data, y_train_data, x_test_data):
    lr = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
    # Train on the (undersampled) training data
    lr.fit(x_train_data, y_train_data.values.ravel())
    # Predict on the real test set
    y_pred_undersample = lr.predict(x_test_data)
    csvFile2 = open('csvFile5.csv', 'w', newline='')  # newline='' avoids blank lines between rows
    writer = csv.writer(csvFile2)
    writer.writerow(['stockcode', 'fake'])
    m = len(y_pred_undersample)
    print('Generating file...')
    for i in range(m):
        writer.writerow([int(id_test[i + 1]), int(y_pred_undersample[i])])
    csvFile2.close()
test(X_train_undersample,y_train_undersample,x_test_real)