1. A simple single-layer stacking
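Stacking trains several base models, collects their out-of-fold predictions as new features, and fits a meta-model (the "stacker") on those features. Before the full script, here is a minimal self-contained sketch of the same idea using scikit-learn's built-in StackingClassifier (available in scikit-learn 0.22+; synthetic data and base estimators chosen purely for illustration), as an alternative to the hand-rolled Ensemble class below:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_toy, y_toy = make_classification(n_samples=500, random_state=0)
stack_demo = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=50, random_state=0)),
                ('svc', SVC(probability=True, random_state=0))],
    final_estimator=LogisticRegression(),
    cv=5)  # 5-fold out-of-fold predictions feed the final LogisticRegression
stack_demo.fit(X_toy, y_toy)
print(stack_demo.score(X_toy, y_toy))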
import csv
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import hstack, vstack, array, nan
from pandas import Series, DataFrame
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, ExtraTreesRegressor,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
# Load the data set and split training / test data
%matplotlib inline
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
# Data preprocessing
train_df = pd.read_csv(PATH_TRAIN, index_col=0)
test_df = pd.read_csv(PATH_TEST, index_col=0)
y_train = train_df.pop('fake')
all_df = pd.concat((train_df, test_df), axis=0)
all_df.isnull().sum().sort_values(ascending=False).head()  # inspect missing values
mean_cols = all_df.mean()
mean_cols.head(10)
all_df = all_df.fillna(mean_cols)
all_df.isnull().sum().sum()
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]
train_df_xy = pd.concat((dummy_train_df,y_train), axis=1)
train2_df = train_df_xy.iloc[:2550,:]
train3_df = train_df_xy.iloc[3825:,:]
train1_df = train_df_xy.iloc[2550:3825,:]
train_df_xy = pd.concat((train2_df,train3_df), axis=0)
train1_label = train1_df.pop('fake')
#add_df = train_df_xy[train_df_xy['fake'].isin([1])]
#train_df_xy = pd.concat((add_df,train_df_xy,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df,add_df), axis=0)
#train_df_xy = train_df_xy.sample(frac = 1)
#y_train = train_df_xy.pop('fake')
#dummy_train_df = train_df_xy
def get_id(path):
    # Read the first column (sample IDs) of a CSV file into a numpy array
    id_list = []
    csv_reader = csv.reader(open(path))
    for row in csv_reader:
        id_list.append(row[0])
    id_array = np.array(id_list)
    return id_array
dummy_train_df = train_df_xy
y_train = train_df_xy.pop('fake')
#oversampler = SMOTE(ratio={1:7000}, random_state=np.random.randint(100), k_neighbors=5, m_neighbors=10, kind='regular', n_jobs=-1)
#os_X_train, os_y_train = oversampler.fit_sample(dummy_train_df,y_train)
#dummy_train_df = os_X_train
#y_train = os_y_train
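# Note: newer versions of imbalanced-learn replaced ratio/fit_sample with
# sampling_strategy/fit_resample. If the oversampling step above is enabled,
# an equivalent call (left commented out, like the original) might look like:
#oversampler = SMOTE(sampling_strategy={1: 7000}, k_neighbors=5, random_state=42)
#dummy_train_df, y_train = oversampler.fit_resample(dummy_train_df, y_train)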
x1_test = np.array(train1_df,dtype = float)
y1_test = np.array(train1_label,dtype = int)
y_train = np.array(y_train,dtype = int)
x_train = np.array(dummy_train_df,dtype = float)
x_test = np.array(dummy_test_df,dtype = float)
id_test = get_id(PATH_TEST)
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y, dtype=int)
        T = np.array(T)
        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
        # Out-of-fold predictions of each base model become the stacker's input features
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print("Fit Model %d fold %d" % (i, j))
                clf.fit(X_train, y_train)
                y_pred = np.array(clf.predict(X_holdout)[:], dtype=int)
                # Debug: show positive predictions (use a fresh loop name; reusing
                # `i` here would clobber the model index and corrupt S_train/S_test)
                for p in y_pred:
                    if int(p) == 1:
                        print(p)
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            # Average the per-fold test-set predictions of model i
            S_test[:, i] = S_test_i.mean(axis=1)
        results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)[:]
        # Debug: show positive final predictions
        for r in res:
            if int(r) == 1:
                print(r)
        return res
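# For reference, the out-of-fold matrix that fit_predict builds by hand can also
# be produced with sklearn's cross_val_predict. A small self-contained sketch on
# synthetic data (the two base models here are chosen only for illustration):
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
demo_models = (LogisticRegression(), RandomForestClassifier(n_estimators=30))
S_demo = np.column_stack(
    [cross_val_predict(m, X_demo, y_demo, cv=3) for m in demo_models])
print(S_demo.shape)  # (300, 2): one out-of-fold prediction column per base model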
# rf params
rf_params = {}
rf_params['n_estimators'] = 50
rf_params['max_depth'] = 8
rf_params['min_samples_split'] = 100
rf_params['min_samples_leaf'] = 30
# xgb params
xgb_params = {}
xgb_params['n_estimators'] = 50
xgb_params['min_child_weight'] = 12
xgb_params['learning_rate'] = 0.27
xgb_params['max_depth'] = 6
xgb_params['subsample'] = 0.77
xgb_params['reg_lambda'] = 0.8
xgb_params['reg_alpha'] = 0.4
xgb_params['base_score'] = 0
xgb_params['silent'] = 1
rf_model = RandomForestClassifier(**rf_params)
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()
svc_model = SVC()
xgb_model = XGBRegressor(**xgb_params)
'''
# XGB model
xgb_model = XGBRegressor(**xgb_params)
#xgb_model = BaggingClassifier(base_estimator=xgb_model_, n_estimators=20,max_samples=0.8, max_features=0.8,
#bootstrap=True,bootstrap_features=False, n_jobs=1, random_state=1)
# RF model
rf_model_ = RandomForestClassifier(**rf_params)
rf_model = BaggingClassifier(base_estimator=rf_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
# ET model
et_model_ = SVC()
et_model = BaggingClassifier(base_estimator=et_model_, n_estimators=20,max_samples=0.9, max_features=0.8)
# SVR model
# SVM is too slow on more than 10,000 samples
#svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)
# DecsionTree model
dt_model_ = GradientBoostingClassifier()
dt_model = BaggingClassifier(base_estimator=dt_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
# AdaBoost model
ada_model_ = AdaBoostClassifier()
ada_model = BaggingClassifier(base_estimator=ada_model_, n_estimators=20,max_samples=0.8, max_features=0.8)
svc = SVC()
clf = BaggingClassifier(base_estimator=ada_model_, n_estimators=20,max_samples=0.8, max_features=0.8,
bootstrap=True,bootstrap_features=False, n_jobs=1, random_state=1)'''
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
stack = Ensemble(n_splits=3,
stacker=clf,
base_models=(rf_model,adb_model, gdbc_model, et_model, svc_model,xgb_model))
y_test = stack.fit_predict(x_train, y_train, x_test)
y_pre = stack.fit_predict(x_train, y_train, x1_test)
print(y_pre)
f1score_val = f1_score(y1_test, y_pre, average='binary')
print('f1 score val:',f1score_val)
csvFile2 = open('stacking.csv', 'w', newline='')  # newline='' avoids blank lines between rows
writer = csv.writer(csvFile2)
writer.writerow(['stockcode', 'fake'])
m = len(y_test)
print('Generating file...')
for i in range(m):
    writer.writerow([int(id_test[i + 1]), int(y_test[i])])
csvFile2.close()
2. Open-source code shared by a Kaggle expert
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from numpy import hstack,vstack,array,nan
from sklearn.metrics import precision_score, recall_score, f1_score
import csv
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
warnings.filterwarnings('ignore')
# Normalize the training and test data together
train_df = pd.read_csv(PATH_TRAIN,index_col=0)
test_df = pd.read_csv(PATH_TEST,index_col=0)
fake_df = train_df.pop('fake')
# Fill missing values with column means
data = pd.concat((train_df,test_df), axis=0)
mean_cols = data.mean()
#print(mean_cols.head)
data = data.fillna(mean_cols)
numeric_cols = data.columns[data.dtypes != 'object']
numeric_col_means = data.loc[:, numeric_cols].mean()
numeric_col_std = data.loc[:, numeric_cols].std()
data.loc[:, numeric_cols] = (data.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
#data=data[~data['A_to_L_ratio'].isin([175086.8088])]
# sklearn's preprocessing module can also handle this step
from sklearn.preprocessing import StandardScaler
# (a reshape with -1 lets numpy infer the number of rows automatically)
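# For reference, the manual z-scoring above is what StandardScaler computes
# (up to ddof: pandas .std() uses ddof=1, StandardScaler uses ddof=0).
# A sketch kept in a separate variable so `data` is not rescaled a second time:
data_scaled_alt = data.copy()
data_scaled_alt[numeric_cols] = StandardScaler().fit_transform(data[numeric_cols])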
dummy_train_df = data.loc[train_df.index]
dummy_test_df = data.loc[test_df.index]
#data.describe().T  # some columns are still not fully normalized
data = pd.concat((dummy_train_df,fake_df), axis=1)
# Count how many samples belong to class 0 and to class 1
count_classes = pd.value_counts(data['fake'], sort = True).sort_index()
count_classes.plot(kind = 'bar')
plt.title("fake class histogram")
plt.xlabel("fake")
plt.ylabel("Frequency")
# Separate the features and the label
X = data.loc[:, data.columns != 'fake']
y = data.loc[:, data.columns == 'fake']
# Count and indices of the fake (fake == 1) samples, used for the resampling step
number_records_fake = len(data[data.fake == 1])
fake_indices = np.array(data[data.fake == 1].index)
# Indices of all normal samples; for undersampling we randomly draw from these
normal_indices = data[data.fake == 0].index
# Randomly sample as many normal records as there are fake ones
random_normal_indices = np.random.choice(normal_indices, number_records_fake, replace = False)
# Convert to a numpy array
random_normal_indices = np.array(random_normal_indices)
# Combine the fake indices and the sampled normal indices
under_sample_indices = np.concatenate([fake_indices,random_normal_indices])
# Select the undersampled rows by index
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'fake']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'fake']
# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.fake == 0])/len(under_sample_data))
print("Percentage of fake transactions: ", len(under_sample_data[under_sample_data.fake == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
from sklearn.model_selection import GridSearchCV, train_test_split
# Split the original data (random_state fixes the shuffle); the held-out part can later validate the model trained on the undersampled set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Split the undersampled data set
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
#Recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report
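# Quick sanity check of the metrics used below on toy labels (illustrative only):
# with y_true = [1, 1, 0, 0, 1] and y_pred = [1, 0, 0, 0, 1], TP = 2, FN = 1, FP = 0,
# so recall = 2/3 and f1 = 2 * precision * recall / (precision + recall) = 0.8.
print(recall_score([1, 1, 0, 0, 1], [1, 0, 0, 0, 1]))  # 0.666...
print(f1_score([1, 1, 0, 0, 1], [1, 0, 0, 0, 1]))      # 0.8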
def get_id(path):
    id_list = []
    csv_reader = csv.reader(open(path))
    for row in csv_reader:
        id_list.append(row[0])
    id_array = np.array(id_list)
    return id_array
id_test = get_id(PATH_TEST)
#print('#########################################',id_test)
x_test_real = np.array(dummy_test_df,dtype = float)
def printing_Kfold_scores(x_train_data, y_train_data):
    # 5-fold cross-validation
    fold = KFold(5, shuffle=False)
    # Try a few candidate values since the best C is not known in advance
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    # each k-fold split gives two index arrays: train_indices = indices[0], test_indices = indices[1]
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')
        recall_accs = []
        auc_accs = []
        f1_accs = []
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            # penalty selects the regularization type; solver='liblinear' supports penalty='l1'
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit on the training part of the fold (indices[0]) and
            # predict on the held-out part (indices[1])
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
            # Get predictions on the held-out fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
            #for i in y_pred_undersample:
            #    if i == 1:
            #        print(i)
            # Recall and f1 of this fold for the current C
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            #auc_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            f1_acc = f1_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            f1_accs.append(f1_acc)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc, 'f1 score = ', f1_acc)
        # Save the mean fold scores for this C
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        results_table.loc[j, 'Mean f1 score'] = np.mean(f1_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('Mean f1 score ', np.mean(f1_accs))
        print('')
    #best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
    # Finally, check which C parameter performed best among the candidates.
    print('*********************************************************************************')
    print(results_table)
    #print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    #return best_c
printing_Kfold_scores(X_train_undersample,y_train_undersample)
def test(x_train_data, y_train_data, x_test_data):
    lr = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
    # Train on the (undersampled) training data
    lr.fit(x_train_data, y_train_data.values.ravel())
    # Predict on the real test set
    y_pred_undersample = lr.predict(x_test_data)
    csvFile2 = open('csvFile5.csv', 'w', newline='')  # newline='' avoids blank lines between rows
    writer = csv.writer(csvFile2)
    writer.writerow(['stockcode', 'fake'])
    m = len(y_pred_undersample)
    print('Generating file...')
    for i in range(m):
        writer.writerow([int(id_test[i + 1]), int(y_pred_undersample[i])])
    csvFile2.close()
test(X_train_undersample,y_train_undersample,x_test_real)