数据挖掘自存代码~

一天天的就知道学习

已于 2022-04-10 15:27:11 修改

阅读量265

点赞数 1

分类专栏：数据挖掘自存笔记文章标签： python 人工智能机器学习

于 2020-06-03 16:41:35 首次发布

本文链接：https://blog.csdn.net/qq_35679701/article/details/106527747

版权

笔记同时被 3 个专栏收录

18 篇文章 1 订阅

订阅专栏

自存

17 篇文章 0 订阅

订阅专栏

数据挖掘

13 篇文章 2 订阅

订阅专栏

方法 2：缺失数据百分比列表

内存压缩

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to reduce its memory footprint.

    Every column whose dtype is int16/32/64 or float16/32/64 is converted to
    the narrowest integer or float dtype whose representable range contains
    the column's minimum and maximum (bounds inclusive).

    Args:
        df: pandas DataFrame; it is modified in place.
        verbose: when true, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object, with downcast columns.

    Note:
        Float columns may be narrowed to float16/float32 purely on range, so
        precision can be lost even though min and max fit.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # NaN-only or infinite ranges fail every comparison above.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

用随机森林回归填补缺失值（特征矩阵中的缺失值）

'''
参数：
    X_missing：要填补的数据集(不含y标签,原数据集含y标签要拆开)
    y_full：数据集的标签
    fill_value：填充方式
def fill_missing_rf(X_missing,y_full,fill_value):

'''
def fill_missing_rf(X_missing, y_full, fill_value):
    """Impute missing feature values with random-forest regression.

    Columns are processed in ascending order of their missing-value count.
    For each column that has gaps, a RandomForestRegressor is trained on the
    rows where the column is present, using all other feature columns plus
    the labels ``y_full`` as inputs (their own gaps are filled with
    ``fill_value``), and its predictions replace the missing entries.

    Args:
        X_missing: pandas DataFrame of features only (no label column).
        y_full: labels, one entry (or row) per row of ``X_missing``.
        fill_value: constant used to fill gaps in the helper feature matrix.

    Returns:
        A copy of ``X_missing`` with gaps filled. Columns without missing
        values, and columns that are entirely missing (nothing to train on),
        are left unchanged.
    """
    X_missing_reg = X_missing.copy()
    missing_counts = X_missing_reg.isnull().sum(axis=0).to_numpy()
    if not missing_counts.any():
        # Nothing to impute; no model is needed.
        return X_missing_reg

    # Imported here so the regressor actually used is in scope for this block.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.impute import SimpleImputer

    # Labels as a 2-D array so they can be stacked next to the features by
    # position instead of being aligned on index labels.
    labels = np.asarray(y_full).reshape(len(X_missing_reg), -1)
    column_positions = np.arange(X_missing_reg.shape[1])

    # argsort yields column positions ordered from fewest to most gaps.
    for i in np.argsort(missing_counts):
        target = X_missing_reg.iloc[:, i]
        null_mask = target.isnull().to_numpy()
        if not null_mask.any() or null_mask.all():
            continue

        # Helper matrix: every other feature column plus the labels.
        # Selection is positional, so it works for any column labels.
        others = X_missing_reg.iloc[:, column_positions != i].to_numpy()
        features = np.hstack([others, labels])
        # The fill strategy is a constant; change `strategy` together with
        # `fill_value` if another scheme (e.g. mean) is wanted.
        features = SimpleImputer(
            missing_values=np.nan, strategy='constant', fill_value=fill_value
        ).fit_transform(features)

        # Rows where the target is present train the model; the rest are predicted.
        model = RandomForestRegressor(n_estimators=100, random_state=233)
        model.fit(features[~null_mask], target[~null_mask])
        X_missing_reg.iloc[null_mask, i] = model.predict(features[null_mask])
    return X_missing_reg
# Example call: impute X_missing, filling gaps in the helper matrix with 0.
# X_missing and y_full are not defined in this snippet and must already exist.
X_missing_reg = fill_missing_rf(X_missing,y_full,0)
X_missing_reg  # the imputed dataset can be used directly

XGB.CV学习曲线，回归为例

# XGB.cv learning curve, regression example.
# Parameter reference: https://xgboost.readthedocs.io/en/latest/parameter.html?highlight=metrics
# load the libraries
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')

# load data
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases; substitute another regression dataset there.
data = load_boston()
X = data.data
y = data.target
# transform form of data
dfull = xgb.DMatrix(X, y)

# Learning curve over the candidate values of the tuned parameter (gamma).
axisx = range(0, 20, 1)
train_rs = []
test_rs = []
num_round = 180
n_fold = 5
for i in axisx:
    # The parameter key is 'objective'; the original 'obj' and 'silent '
    # (trailing space) keys did not name any XGBoost parameter.
    param = {
        'objective': 'reg:squarederror',
        'gamma': i,
    }
    cvresult = xgb.cv(param, dfull, num_round, n_fold, metrics='rmse')
    # xgb.cv returns one row per boosting round with the columns
    # train-rmse-mean, train-rmse-std, test-rmse-mean, test-rmse-std;
    # keep the last round's means for this gamma value.
    train_rs.append(float(cvresult['train-rmse-mean'].iloc[-1]))
    test_rs.append(float(cvresult['test-rmse-mean'].iloc[-1]))

# RMSE is an error, so the best setting is the one with the smallest value.
print('the best effect is {}st on train,the value is {}'.format(axisx[train_rs.index(min(train_rs))], min(train_rs)))
print('the best effect is {}st on test,the value is {}'.format(axisx[test_rs.index(min(test_rs))], min(test_rs)))
plt.figure(figsize=(20, 5))
plt.plot(axisx, train_rs, c="red", label="XGB-train")
plt.plot(axisx, test_rs, c="blue", label="XGB-test")
plt.legend()
plt.show()

XGB网格搜索，分类为例


# XGBoost grid search, classification example.
# NOTE: in a notebook this cell was timed with the `%%time` cell magic; the
# magic is not valid Python, so it is kept only as this comment.
# load libraries
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_iris

# load data
iris = load_iris()
X, y = iris.data, iris.target
col = iris.target_names
# Split into a training set and a validation set.
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=1)

# set params: every combination below is evaluated by GridSearchCV.
# NOTE(review): scale_pos_weight targets binary class imbalance; confirm it
# has any effect with the multi-class objective used here.
parameters = {
    'max_depth': [5, 6],
    'learning_rate': [0.01, 0.02],
    'n_estimators': [500, 600],
    'min_child_weight': [0, 1],
    'max_delta_step': [0, 1],
    'subsample': [0.6, 0.7],
    'colsample_bytree': [0.5, 0.6],
    'reg_alpha': [0, 1],
    'reg_lambda': [0.2, 0.3],
    'scale_pos_weight': [0.2, 0.3],
}

# Base estimator; values listed in `parameters` are overridden per candidate.
# verbosity / n_jobs / random_state replace the older silent / nthread / seed
# spellings, and `missing` is left at the library default instead of None.
xlf = xgb.XGBClassifier(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=2000,
    verbosity=0,
    objective='multi:softmax',
    num_class=3,
    n_jobs=1,  # GridSearchCV below already parallelises across candidates
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.85,
    colsample_bytree=0.7,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    random_state=0,
)
# scoring options: https://blog.csdn.net/u014652309/article/details/84979497
gs = GridSearchCV(xlf, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(train_x, train_y)

print("Best score: %0.3f" % gs.best_score_)
print("Best parameters set: %s" % gs.best_params_)

Xgboost使用sklearn的评估接口


'''
使用sklearn的评估接口#菜菜xgboostP43页
'''
# Import the model evaluation metrics. The import is parenthesised so that it
# can legally span several lines.
from sklearn.metrics import (
    accuracy_score as accuracy,
    recall_score as recall,
    roc_auc_score as auc,
)

# Train one booster per scale_pos_weight value and report its metrics.
# names, scale_pos_weight, dtrain, dtest, num_round, Ytest and xgb are not
# defined in this snippet and must already exist.
for name, i in zip(names, scale_pos_weight):
    param = {'verbosity': 0, 'objective': 'binary:logistic',
             "eta": 0.1, "scale_pos_weight": i}
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    # Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
    ypred = (preds > 0.5).astype(preds.dtype)
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
    print("\tRecall:{}".format(recall(Ytest, ypred)))
    # AUC is computed from the raw scores, not the thresholded labels.
    print("\tAUC:{}".format(auc(Ytest, preds)))

Xgboost使用Hyperopt调参


'''step0
#导入模型评估指标
#load libraries
'''
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, partial
import xgboost as xgb


'''
step1 load data(注意训练和测试集)
''' 

# Wrap the train / test splits in DMatrix objects. Xtrain, Ytrain, Xtest and
# Ytest are not defined in this snippet and must already exist.
dtrain = xgb.DMatrix(data=Xtrain, label=Ytrain)
dtest = xgb.DMatrix(data=Xtest, label=Ytest)
# xgb.train bases early stopping on the LAST entry of the evaluation list, so
# the held-out set goes last; otherwise stopping tracks the training metric.
evallist = [(dtrain, 'train'), (dtest, 'eval')]

'''
step234的开启参数一定要一致
step2设置hyperopt的参数空间（格式范围为下限，上限）
PS:
①最低2个参数空间
②固定参数不想调的参数可以用极小的范围来变相固定
''' 
#
# Hyperopt search space. hp.randint entries draw integers and hp.uniform
# entries draw floats between the two bounds given.
# Author's note: tuning max_depth together with lambda worked best in testing.
# An alternative learning_rate range that was tried: 2e-1 .. (5e-1)+(1e-12).
space = {
    'n_estimators': hp.randint('n_estimators', 150, 250),
    'learning_rate': hp.uniform('learning_rate', 2e-2, 5e-2),  # valid range [0, 1]
    'max_depth': hp.randint('max_depth', 15, 25),  # maximum tree depth
    'gamma': hp.uniform('gamma', 10, 20),  # pruning threshold, [0, inf)
    'subsample': hp.uniform('subsample', 2e-1, 4e-1),  # row sampling, (0, 1]
    'colsample_bytree': hp.uniform('colsample_bytree', 1e-1, 4e-1),  # column sampling, [0, 1]
    'alpha': hp.randint('alpha', 0, 20),  # L1 regularisation, [0, inf)
    'lambda': hp.randint('lambda', 0, 30),  # L2 regularisation, [0, inf)
}

'''
step3 设置XGBOOST工厂模式
PS:
①注意开启的参数空间要和hyperopt一致
'''

def xgboost_factory(argsDict):
    """Train one XGBoost booster for a sampled parameter set and score it.

    Args:
        argsDict: dict of sampled hyper-parameters with the keys
            n_estimators, learning_rate, max_depth, gamma, subsample,
            colsample_bytree, alpha and lambda.

    Returns:
        The value returned by ``get_tranformer_score`` for the trained booster.
    """
    argsDict = argsDict_tranform(argsDict)

    # Only genuine booster parameters go in here: the number of trees is
    # passed to xgb.train as the round count, and missing-value handling is
    # configured on the DMatrix, not on the booster.
    params = {
        'nthread': -1,  # number of threads
        'verbosity': 0,
        'objective': 'binary:hinge',
        'booster': 'dart',
        'eta': argsDict['learning_rate'],  # learning rate
        'max_depth': argsDict['max_depth'],  # maximum tree depth
        'gamma': argsDict['gamma'],  # pruning threshold
        'subsample': argsDict['subsample'],  # row sampling ratio
        'colsample_bytree': argsDict['colsample_bytree'],  # column sampling ratio
        'alpha': argsDict['alpha'],  # L1 regularisation
        'lambda': argsDict['lambda'],  # L2 regularisation
        # other params
        'min_child_weight': 1,
        # 1 is the neutral value; 0 would give positive samples zero weight.
        'scale_pos_weight': 1,
        'seed': 233,  # random seed
        # Replace the metric to suit the task.
        'eval_metric': ['auc'],
    }

    num_boost_round = int(argsDict['n_estimators'])  # number of trees
    xrf = xgb.train(params, dtrain, num_boost_round, evallist, early_stopping_rounds=100)

    return get_tranformer_score(xrf)
    
'''
step4 转换参数范围
PS:
①注意开启的参数空间要和hyperopt一致
'''
def argsDict_tranform(argsDict, isPrint=False):
    """Map sampled hyper-parameters onto the values handed to XGBoost.

    Only ``max_depth`` is changed (shifted by +5); every other entry is
    passed through as sampled. The input dict is left untouched, so calling
    this repeatedly on the same dict always yields the same result.

    Args:
        argsDict: dict of sampled hyper-parameters; must contain 'max_depth'.
        isPrint: when true, print the transformed dict.

    Returns:
        A new dict holding the transformed parameters.
    """
    # Work on a copy: mutating the caller's dict made each repeated call add
    # another +5 to max_depth.
    transformed = dict(argsDict)
    # NOTE(review): the +5 offset comes on top of whatever range the search
    # space samples for max_depth - confirm the shift is still intended.
    transformed["max_depth"] = argsDict["max_depth"] + 5  # maximum tree depth

    if isPrint:
        print(transformed)

    return transformed

'''
step5 获取最好分数
PS:
'''

def get_tranformer_score(tranformer):
    """Score a trained booster on the held-out data.

    Args:
        tranformer: trained booster exposing ``predict`` and
            ``best_ntree_limit`` (as returned by ``xgb.train``).

    Returns:
        ROC AUC of the booster's predictions on ``Xtest`` against ``Ytest``.
        Swap the metric here if a different score is wanted.
    """
    # r2_score was never imported in this file, and the surrounding code
    # reports this value as roc_auc_score. The import is local because the
    # driver code later rebinds the module-level name `roc_auc_score`.
    from sklearn.metrics import roc_auc_score

    dpredict = xgb.DMatrix(Xtest)  # note: built from Xtest
    # NOTE(review): newer XGBoost releases replace ntree_limit with
    # iteration_range - confirm against the installed version.
    prediction = tranformer.predict(dpredict, ntree_limit=tranformer.best_ntree_limit)

    return roc_auc_score(Ytest, prediction)

# Run the automatic hyper-parameter search with hyperopt.
algo = partial(tpe.suggest, n_startup_jobs=1)
# fmin MINIMISES its objective, while xgboost_factory returns a score where
# higher is better, so the score is negated for the search.
best = fmin(lambda args: -xgboost_factory(args), space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)

# step6: report the best score (rename / relabel to match the metric used).
# Copies of `best` are passed on so the sampled values stay unmodified, and
# the result gets its own name instead of shadowing the imported
# roc_auc_score function.
best_score = xgboost_factory(dict(best))
print('best :', best)
print('best param after transform :')
argsDict_tranform(dict(best), isPrint=True)
# print('rmse of the best xgboost:', np.sqrt(RMSE))
print('roc_auc_score of the best xgboost:', best_score)

stacking模型融合

第一层基模型过程
在这里插入图片描述
第二层元模型过程

参考1;
参考2

'''
data & models & n_folds
'''
#data
# Row counts of the training and test sets; `train` and `test` are not
# defined in this snippet and must already exist.
ntrain =train.shape[0]
ntest = test.shape[0]
# Base models for the first stacking layer.
# NOTE(review): GBDT and RF are presumably import aliases for gradient
# boosting / random forest estimators - confirm against the imports.
clf1 = GBDT(n_estimators=100)
clf2 = RF(n_estimators=100)
# Number of folds and the splitter.
n_folds = 6
'''
网上用kf和skf的都有,个人认为在这里应该是skf,因为skf是根据标签中的分类比例来划分,
这样哪怕在数据不均衡的时候还是按比例划分折数,否则的话28这这样的情况在KF划5折,
可能第1折里少数类就被全部划走了
'''
# Author's rationale (the string above): both KFold and StratifiedKFold are
# seen online; the stratified splitter is preferred because it keeps the
# label proportions in every fold, whereas a plain KFold on imbalanced data
# could put the whole minority class into a single fold.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2333)  # stratified sampling
# Unstratified alternative, left disabled:
#kf=KFold(n_split = n_folds,random_state=2333)

'''
定义stacking函数
'''
def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Build out-of-fold (stacking) features for one base model.

    For every fold the model is fitted on the remaining folds, then used to
    predict the held-out fold (filling the training-side feature) and the
    whole test set (one row per fold, averaged at the end).

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: training features, array-like indexed by row.
        y_train: training labels, one per row of ``x_train``.
        x_test: test features.
        cv: splitter exposing ``split(X, y)``; defaults to the module-level
            ``skf``.

    Returns:
        Tuple ``(oof_train, oof_test)`` of column vectors with shapes
        ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    splitter = skf if cv is None else cv
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Split the data that was actually passed in, and size the buffers from
    # it rather than from module-level globals.
    folds = list(splitter.split(x_train, y_train))
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_folds = np.empty((len(folds), x_test.shape[0]))  # one row per fold

    for fold, (fit_index, holdout_index) in enumerate(folds):
        clf.fit(x_train[fit_index], y_train[fit_index])
        # Held-out predictions become the second-layer training feature.
        oof_train[holdout_index] = clf.predict(x_train[holdout_index])
        # Each fold's model also predicts the full test set.
        oof_test_folds[fold, :] = clf.predict(x_test)

    # Average the per-fold test predictions column-wise.
    oof_test = oof_test_folds.mean(axis=0)
    # Reshape both results from flat vectors into single-column matrices.
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
'''
使用oof_train和oof_test进行用逻辑回归第二层的训练
'''