金融风控Task6学习笔记
一、学习知识点概要
这次是提交了比赛结果,完成了整个赛题,算是这段学习的一个结尾吧。
二、学习内容(部分代码)
xgb模型
导入需要的库
pip install bayesian-optimization(注意:这是 shell 命令,需在命令行中执行;若在 Jupyter Notebook 中运行,应写作 !pip install bayesian-optimization)
import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import cross_val_score
读取文件
# Load the preprocessed ("d1") training set and test set A from CSV files
# in the current working directory.
train1=pd.read_csv('d1_train.csv')
test_a1=pd.read_csv('d1_testa.csv')
压缩数据(预处理)
def reduce_mem_usage(df):
    """Downcast every numeric column of ``df`` to the narrowest dtype that
    can hold its observed value range, to cut memory usage.

    Non-numeric (object) columns are converted to pandas ``category``.
    NOTE: float16 has limited precision (~3 decimal digits); this is usually
    acceptable for tree models but can distort exact values.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (columns are re-assigned) and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame with downcast dtypes.
    """
    # Bug fix: memory_usage() reports BYTES, so convert to MB before
    # printing (the original printed raw bytes labelled "MB").
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest signed integer type that fits the range.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Float column: pick the narrowest float type that fits.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Object columns become 'category' to deduplicate repeated strings.
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# Downcast both frames to save memory before model training.
train1 = reduce_mem_usage(train1)
test_a1 = reduce_mem_usage(test_a1)
训练数据
# The target is the 'isDefault' column; every remaining column is a feature.
y_train = train1['isDefault']
X_train = train1.drop('isDefault', axis=1)
# Test set A carries no label; it is used as-is for inference.
X_test = test_a1
五折交叉验证
from sklearn.model_selection import KFold
# 5-fold cross-validation; a fixed seed keeps the fold assignment
# reproducible across runs.
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
训练模型与保存
"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
cv_scores = []
# 5-fold CV: train one XGBoost model per split and record the validation AUC.
# Fix: the original listing had lost all indentation (a syntax error —
# "expected an indented block"); the loop structure is restored here.
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    # Row-position indexing for features; label Series is indexed the same way.
    X_train_split, y_train_split, X_val, y_val = (
        X_train.iloc[train_index], y_train[train_index],
        X_train.iloc[valid_index], y_train[valid_index],
    )
    dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
    dtest = xgb.DMatrix(X_val)
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',   # probability output for binary target
              'eval_metric': 'auc',
              'max_depth': 4,
              'lambda': 10,                     # L2 regularisation weight
              'subsample': 0.75,
              'colsample_bytree': 0.75,
              'min_child_weight': 2,
              'eta': 0.025,                     # learning rate
              'seed': 0,
              'nthread': 8,
              'silent': 1}                      # NOTE(review): removed in xgboost >= 1.0; use 'verbosity' there
    watchlist = [(dtrain, 'train')]
    # num_boost_round=5 is only a smoke test; raise it for a real submission.
    model2 = xgb.train(params, dtrain, num_boost_round=5, evals=watchlist)
    # Predicted default probabilities on the held-out fold.
    val_pred = model2.predict(dtest)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("xgb_scotrainre_list:{}".format(cv_scores))
print("xgb_score_mean:{}".format(np.mean(cv_scores)))
print("xgb_score_std:{}".format(np.std(cv_scores)))
# Persist the trained model with pickle.  Note that model2 is overwritten on
# every fold above, so only the LAST fold's model is saved here.
import pickle

# Fix: use context managers so the file handles are closed deterministically
# (the original called open() inline and never closed either file).
# save model to file
with open("pima.pickle.dat", "wb") as f:
    pickle.dump(model2, f)

# load model from file (round-trip check)
# SECURITY NOTE: pickle.load executes arbitrary code during deserialization;
# only ever load pickle files from a trusted source.
with open("pima.pickle.dat", "rb") as f:
    loaded_model2 = pickle.load(f)
三、问题与解决
Found arrays with inconsistent numbers of samples
该错误通常是因为传入模型的特征 X 与标签 y 的样本数不一致,检查特征与标签的划分(行数是否相同)即可定位。参考链接: https://blog.csdn.net/weixin_37029453/article/details/78896425
expected an indented block
Python语言是一款对缩进非常敏感的语言,Tab和空格的混用会导致错误,或者缩进不对,这是难以直接观察到的,在编译时会出现这样的错误:expected an indented block。它说明此处需要缩进,你只要在出现错误的那一行,按空格或Tab(但不能混用)键缩进就行。
四、思考与总结
其实像这样的学习还是实践最为重要,将代码自己敲打并运行一遍,根据需求去修改这些代码,当然可能会有一些错误出现,可以自行搜索解决办法,这样对代码的理解程度也会高一些。