内存压缩
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* to the smallest dtype that holds them.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched.  Integer columns are
    cast to the narrowest of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive).  Float columns are cast to
    float16 or float32 when the range fits, otherwise to float64.

    Parameters:
        df: pandas DataFrame.  It is modified in place.
        verbose: when true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame object, after downcasting.

    NOTE: float downcasting only checks the value range, not precision, so
    float16/float32 columns may lose significant digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fallback: if nothing fits (e.g. empty column -> NaN min/max)
            # the column keeps its current dtype.
            candidates, info, fallback = int_types, np.iinfo, None
        else:
            candidates, info, fallback = float_types, np.finfo, np.float64
        # Inclusive bounds: a value equal to the type's limit still fits.
        target = next(
            (t for t in candidates
             if info(t).min <= c_min and c_max <= info(t).max),
            fallback,
        )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero-byte starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
用随机森林回归填补缺失值(特征矩阵中的缺失值)
'''
参数:
X_missing:要填补的数据集(不含y标签,原数据集含y标签要拆开)
y_full:数据集的标签
fill_value:填充方式
def fill_missing_rf(X_missing,y_full,fill_value):
'''
def fill_missing_rf(X_missing, y_full, fill_value):
    """Impute missing feature values with random-forest regression.

    Columns are processed from fewest to most missing values.  For each
    column, the remaining features plus the labels form the predictor matrix
    (its own gaps temporarily filled with ``fill_value``); a
    RandomForestRegressor is fitted on the rows where the column is present
    and predicts the rows where it is missing.  Values imputed for earlier
    columns are reused when imputing later ones.

    Parameters:
        X_missing: pandas DataFrame of features only (no label column).
        y_full: labels, one entry (or row) per row of ``X_missing``; matched
            to the rows by position.
        fill_value: constant used to temporarily fill gaps in the predictor
            matrix.

    Returns:
        A copy of ``X_missing`` with missing values imputed.  Columns that
        are entirely missing are left as they are because there is nothing
        to train on.

    Raises:
        ValueError: if ``y_full`` does not have one entry per row.
    """
    X_missing_reg = X_missing.copy()
    missing_counts = X_missing_reg.isnull().sum(axis=0).to_numpy()
    if not missing_counts.any():
        # Nothing to impute; return the copy without needing scikit-learn.
        return X_missing_reg

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.impute import SimpleImputer

    n_rows, n_cols = X_missing_reg.shape
    labels = np.asarray(y_full).reshape(n_rows, -1)
    column_positions = np.arange(n_cols)

    # Positional indexing throughout, so arbitrary row/column labels work.
    for i in np.argsort(missing_counts, kind="stable"):
        fillc = X_missing_reg.iloc[:, i]
        is_missing = fillc.isnull().to_numpy()
        if not is_missing.any() or is_missing.all():
            # No rows to predict, or no rows to learn from.
            continue

        # Predictors: every other feature plus the original labels.
        others = X_missing_reg.iloc[:, column_positions != i].to_numpy()
        predictors = np.column_stack([others, labels])
        # The fill value can be adapted to the data (the original note
        # suggests e.g. a mean-based strategy; the imputer arguments would
        # have to change accordingly).
        df_0 = SimpleImputer(
            missing_values=np.nan, strategy='constant', fill_value=fill_value
        ).fit_transform(predictors)

        rfc = RandomForestRegressor(n_estimators=100, random_state=233)
        rfc.fit(df_0[~is_missing], fillc.to_numpy()[~is_missing])
        # Write the predictions back into the working copy.
        X_missing_reg.iloc[is_missing, i] = rfc.predict(df_0[is_missing])
    return X_missing_reg
# Example call: impute X_missing using y_full as an extra predictor, with 0 as
# the temporary fill value (X_missing and y_full are defined elsewhere).
X_missing_reg = fill_missing_rf(X_missing,y_full,0)
X_missing_reg# the imputed dataset can be used directly
XGB.CV学习曲线,回归为例
'''
XGB.CV学习曲线,回归为例
'''
#参数网址:https://xgboost.readthedocs.io/en/latest/parameter.html?highlight=metrics
#load the libraries
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')
# Load data.
# NOTE(review): load_boston was removed in scikit-learn 1.2; on newer versions
# substitute another regression dataset here.
data = load_boston()
X = data.data
y = data.target
# Wrap the arrays in xgboost's DMatrix container.
dfull = xgb.DMatrix(X, label=y)

# Learning curve with xgb.cv: sweep gamma and record the final-round RMSE.
axisx = range(0, 20, 1)
num_round = 180
n_fold = 5
train_rs = []
test_rs = []
for i in axisx:
    param = {
        'verbosity': 1,  # replaces the mistyped / deprecated 'silent ' key
        'objective': 'reg:squarederror',  # 'obj' is not a parameter name; 'reg:linear' is deprecated
        'gamma': i,
    }
    cvresult = xgb.cv(param, dfull, num_boost_round=num_round, nfold=n_fold, metrics='rmse')
    # xgb.cv returns one row per boosting round with the columns
    # train-rmse-mean, train-rmse-std, test-rmse-mean, test-rmse-std;
    # the last row is the result after all num_round rounds.
    train_rs.append(float(cvresult['train-rmse-mean'].iloc[-1]))
    test_rs.append(float(cvresult['test-rmse-mean'].iloc[-1]))

# RMSE is an error measure, so the best setting is the one with the MINIMUM.
best_train = min(train_rs)
best_test = min(test_rs)
print('the best effect is {}st on train,the value is {}'.format(axisx[train_rs.index(best_train)], best_train))
print('the best effect is {}st on test,the value is {}'.format(axisx[test_rs.index(best_test)], best_test))

plt.figure(figsize=(20, 5))
plt.plot(axisx, train_rs, c="red", label="XGB-train")
plt.plot(axisx, test_rs, c="blue", label="XGB-test")
plt.legend()
plt.show()
XGB网格搜索,分类为例
'''
XGB网格搜索,分类为例
'''
%%time
#load libraries
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_iris
# Load data.
iris = load_iris()
X, y = iris.data, iris.target
col = iris.target_names
# Split into training and validation sets.
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=1)

# Parameter grid explored by GridSearchCV (2 values each -> 2**10 candidates).
# NOTE(review): scale_pos_weight is documented for binary class imbalance; for
# this 3-class problem it presumably has no effect and only doubles the grid.
parameters = {
    'max_depth': [5, 6],
    'learning_rate': [0.01, 0.02],
    'n_estimators': [500, 600],
    'min_child_weight': [0, 1],
    'max_delta_step': [0, 1],
    'subsample': [0.6, 0.7],
    'colsample_bytree': [0.5, 0.6],
    'reg_alpha': [0, 1],
    'reg_lambda': [0.2, 0.3],
    'scale_pos_weight': [0.2, 0.3],
}

# Base estimator; values listed in `parameters` are overridden by the search.
xlf = xgb.XGBClassifier(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=2000,
    verbosity=0,  # replaces the deprecated silent=True
    objective='multi:softmax',  # the number of classes is inferred from y
    n_jobs=1,  # replaces nthread; parallelism is handled by GridSearchCV below
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.85,
    colsample_bytree=0.7,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    random_state=0,  # replaces the deprecated seed
    missing=float('nan'),
)

# scoring reference: https://blog.csdn.net/u014652309/article/details/84979497
gs = GridSearchCV(xlf, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(train_x, train_y)

print("Best score: %0.3f" % gs.best_score_)
print("Best parameters set: %s" % gs.best_params_)
# Evaluate the refit best model on the split that was held out above.
print("Validation accuracy: %0.3f" % gs.score(valid_x, valid_y))
Xgboost使用sklearn的评估接口
'''
使用sklearn的评估接口#菜菜xgboostP43页
'''
# Import the evaluation metrics (parenthesised so the multi-line import parses).
from sklearn.metrics import (
    accuracy_score as accuracy,
    recall_score as recall,
    roc_auc_score as auc,
)

# Train one booster per scale_pos_weight value and report its metrics.
# names, scale_pos_weight, dtrain, dtest, num_round and Ytest are expected to
# be defined earlier in the notebook.
for name, i in zip(names, scale_pos_weight):
    param = {
        'verbosity': 0,  # replaces the deprecated 'silent'
        'objective': 'binary:logistic',
        "eta": 0.1,
        "scale_pos_weight": i,
    }
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    # Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
    ypred = (preds > 0.5).astype(preds.dtype)
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
    print("\tRecall:{}".format(recall(Ytest, ypred)))
    # AUC is computed on the raw probabilities, not the thresholded labels.
    print("\tAUC:{}".format(auc(Ytest, preds)))
Xgboost使用Hyperopt调参
'''step0
#导入模型评估指标
#load libraries
'''
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, partial
import xgboost as xgb
'''
step1 load data(注意训练和测试集)
'''
# Build the DMatrix containers (Xtrain/Ytrain/Xtest/Ytest come from earlier
# in the notebook).
dtrain = xgb.DMatrix(data=Xtrain, label=Ytrain)
dtest = xgb.DMatrix(data=Xtest, label=Ytest)
# xgb.train uses the LAST entry of evals for early stopping, so the held-out
# set must come last; with the training set last, early stopping would track
# the training metric.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
'''
step234的开启参数一定要一致
step2设置hyperopt的参数空间(格式范围为下限,上限)
PS:
①最低2个参数空间
②固定参数不想调的参数可以用极小的范围来变相固定
'''
#
# Hyperopt search space.  Each entry is hp.randint(label, low, high) or
# hp.uniform(label, low, high).  The original author's note: tuning
# max_depth + lambda gave the best results in their tests.
# A wider alternative that was tried for learning_rate:
#   hp.uniform('learning_rate', 2e-1, (5e-1) + (1e-12))
space = {
    "n_estimators": hp.randint("n_estimators", 150, 250),
    'learning_rate': hp.uniform('learning_rate', 2e-2, 5e-2),  # valid range [0, 1]
    'max_depth': hp.randint("max_depth", 15, 25),  # maximum tree depth
    'gamma': hp.uniform('gamma', 10, 20),  # pruning threshold, [0, inf)
    'subsample': hp.uniform('subsample', 2e-1, 4e-1),  # row sampling ratio, (0, 1]
    'colsample_bytree': hp.uniform('colsample_bytree', 1e-1, 4e-1),  # column sampling ratio
    'alpha': hp.randint("alpha", 0, 20),  # L1 regularisation, [0, inf)
    'lambda': hp.randint("lambda", 0, 30),  # L2 regularisation, [0, inf)
}
'''
step3 设置XGBOOST工厂模式
PS:
①注意开启的参数空间要和hyperopt一致
'''
def xgboost_factory(argsDict):
    """Train an XGBoost booster for one hyperopt sample and return its score.

    Parameters:
        argsDict: dict with the keys of the hyperopt search space
            (n_estimators, learning_rate, max_depth, gamma, subsample,
            colsample_bytree, alpha, lambda).  It is not modified.

    Returns:
        Whatever get_tranformer_score returns for the trained booster.

    Relies on the module-level dtrain and evallist.
    """
    # Work on a copy so the caller's dict (e.g. hyperopt's `best`) is never
    # changed by the transform step.
    argsDict = argsDict_tranform(dict(argsDict))
    # The number of trees is the num_boost_round argument of xgb.train, not a
    # booster parameter, so it is kept out of `params`.
    num_boost_round = int(argsDict['n_estimators'])
    params = {
        'nthread': -1,  # number of threads
        'verbosity': 0,  # replaces the deprecated 'silent'
        'objective': 'binary:hinge',
        'booster': 'dart',
        'eta': argsDict['learning_rate'],  # learning rate
        'max_depth': int(argsDict['max_depth']),  # maximum tree depth
        'gamma': argsDict['gamma'],  # pruning threshold
        'subsample': argsDict['subsample'],  # row sampling ratio
        'colsample_bytree': argsDict['colsample_bytree'],  # column sampling ratio
        'alpha': argsDict['alpha'],  # L1 regularisation
        'lambda': argsDict['lambda'],  # L2 regularisation
        # other params
        'min_child_weight': 1,
        # XGBoost's default; the previous value 0 would give positive samples
        # zero weight for objectives that honour this parameter.
        'scale_pos_weight': 1,
        'seed': 233,  # random seed
        # Change the metric here if a different one is needed.
        'eval_metric': ['auc'],
    }
    # NOTE(review): the old 'missing': -999 entry was dropped because it is not
    # a booster parameter; if -999 marks missing values, pass missing=-999
    # when building the DMatrix objects instead.
    xrf = xgb.train(params, dtrain, num_boost_round, evallist, early_stopping_rounds=100)
    return get_tranformer_score(xrf)
'''
step4 转换参数范围
PS:
①注意开启的参数空间要和hyperopt一致
'''
def argsDict_tranform(argsDict, isPrint=False):
    """Convert a raw hyperopt sample into the values used for training.

    Only ``max_depth`` is changed (shifted by +5); every other entry is
    passed through as is.

    Parameters:
        argsDict: dict of sampled hyperparameters; must contain "max_depth".
            It is NOT modified, so calling this function repeatedly on the
            same dict always yields the same result.
        isPrint: when true, print the transformed dict.

    Returns:
        A new dict with the transformed values.
    """
    # Copy first: mutating the input made every repeated call on the same
    # dict add another +5 to max_depth.
    transformed = dict(argsDict)
    transformed["max_depth"] = transformed["max_depth"] + 5  # maximum tree depth
    if isPrint:
        print(transformed)
    return transformed
'''
step5 获取最好分数
PS:
'''
def get_tranformer_score(tranformer):
    """Score a trained booster on the module-level test set.

    Parameters:
        tranformer: trained xgboost Booster.

    Returns:
        r2_score(Ytest, prediction).  Swap the metric to suit the task.

    Relies on the module-level Xtest and Ytest.
    """
    # Imported locally because the visible file does not import r2_score.
    from sklearn.metrics import r2_score

    xrf = tranformer
    dpredict = xgb.DMatrix(Xtest)  # note: predictions are made on Xtest
    # ntree_limit / best_ntree_limit were removed in XGBoost 2.0;
    # iteration_range is the replacement.  best_iteration is only available
    # when early stopping was used, so fall back to all trees otherwise.
    best_iteration = getattr(xrf, 'best_iteration', None)
    if best_iteration is not None:
        prediction = xrf.predict(dpredict, iteration_range=(0, best_iteration + 1))
    else:
        prediction = xrf.predict(dpredict)
    return r2_score(Ytest, prediction)
# Run the hyperopt search.
# fmin MINIMISES its objective, while xgboost_factory returns a score where
# higher is better, so the score is negated here.  Drop the negation if the
# scorer is switched to an error measure such as RMSE.
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(lambda args: -xgboost_factory(args), space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)
'''
step6 打印最好分数
PS:
'''
# Re-train with the best sample and report its score.  Copies of `best` are
# passed so it is never transformed twice, and the result goes into its own
# name instead of overwriting the imported roc_auc_score function.
best_score = xgboost_factory(dict(best))  # adjust together with the scorer
print('best :', best)
print('best param after transform :')
argsDict_tranform(dict(best), isPrint=True)
# The label below should be changed together with the scoring metric.
print('roc_auc_score of the best xgboost:', best_score)
stacking模型融合
第一层基模型过程
第二层元模型过程
'''
data & models & n_folds
'''
# Data: number of rows in the training and test sets (train / test are
# defined elsewhere).
ntrain =train.shape[0]
ntest = test.shape[0]
# Base models for the first stacking layer.
# NOTE(review): GBDT and RF are presumably import aliases for gradient
# boosting and random forest estimators defined elsewhere - confirm.
clf1 = GBDT(n_estimators=100)
clf2 = RF(n_estimators=100)
# Number of folds and the fold splitter.
n_folds = 6
# The note below (kept verbatim) argues for StratifiedKFold over KFold:
# stratified splitting keeps the label ratio in every fold, so with imbalanced
# classes the minority class cannot end up concentrated in a single fold.
'''
网上用kf和skf的都有,个人认为在这里应该是skf,因为skf是根据标签中的分类比例来划分,
这样哪怕在数据不均衡的时候还是按比例划分折数,否则的话28这这样的情况在KF划5折,
可能第1折里少数类就被全部划走了
'''
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2333)# stratified sampling
# Plain K-fold alternative: kf = KFold(n_splits=n_folds, shuffle=True, random_state=2333)
'''
定义stacking函数
'''
def get_oof(clf, x_train, y_train, x_test):
    """Produce out-of-fold predictions for one base model (stacking layer 1).

    For every fold of the module-level splitter ``skf`` the model is fitted
    on the in-fold rows, predicts the held-out rows of the training set, and
    predicts the whole test set.  The test predictions are averaged over the
    folds.

    Parameters:
        clf: estimator exposing fit(X, y) and predict(X).
        x_train: training features, array-like, indexed by row position.
        y_train: training labels, array-like, one per row of x_train.
        x_test: test features, array-like.

    Returns:
        (oof_train, oof_test): column vectors of shape (n_train, 1) and
        (n_test, 1) to be used as features for the second-layer model.
    """
    # Convert to arrays so positional fold indices work for DataFrame/Series
    # inputs as well.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Sizes come from the arguments rather than from notebook globals.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_splits = skf.get_n_splits()

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((n_splits, n_test))  # one row of test predictions per fold

    # Split the data that was passed in (previously the global X_train).
    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        clf.fit(x_train[train_index], y_train[train_index])
        # Held-out predictions become the stacking training feature.
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions column-wise.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
'''
使用oof_train和oof_test进行用逻辑回归第二层的训练
'''
参考:Stacking方法详解含mlxtend代码
备注:使用第二个use_probas = True,average_probas = False的代码
参考2:voting & bagging & stacking画图代码