比赛地址:https://www.kaggle.com/c/ieee-fraud-detection
1 Extensive EDA and Modeling XGB Hyperopt
第一个kernel
https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
1.1 数据集减重
def reduce_mem_usage(df, verbose=True):
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
start_mem = df.memory_usage().sum() / 1024**2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024**2
if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
return df
代码解读:
因为不同的int数据所占的字节不同,int64的最大值为2的63次方-1,他在机器中占了64个字节数,int8的最大值为2的7次方-1 = 127,他只会占用8个字节,如果一个数是56,却用了int64来表示,那么这个数占用了更多的内存空间,所以我们需要将空间释放掉,方式就是将其转化为int8,如果一个数是int64型,值为130,那么不能转化为int8,但是可以转化为int16,所以我们可以按照这样的原理,来将内存释放掉,float数值同理。
1.2 找出离群值
def CalcOutliers(df_num):
# calculating mean and std of the array
data_mean, data_std = np.mean(df_num), np.std(df_num)
# seting the cut line to both higher and lower values
# You can change this value
cut = data_std * 3
#Calculating the higher and lower cut values
lower, upper = data_mean - cut, data_mean + cut
# creating an array of lower, higher and total outlier values
outliers_lower = [x for x in df_num if x < lower]
outliers_higher = [x for x in df_num if x > upper]
outliers_total = [x for x in df_num if x < lower or x > upper]
# array without outlier values
outliers_removed = [x for x in df_num if x > lower and x < upper]
print('Identified lowest outliers: %d' % len(outliers_lower)) # printing total number of values in lower cut of outliers
print('Identified upper outliers: %d' % len(outliers_higher)) # printing total number of values in higher cut of outliers
print('Total outlier observations: %d' % len(outliers_total)) # printing total number of values outliers of both sides
print('Non-outlier observations: %d' % len(outliers_removed)) # printing total number of non outlier values
print("Total percentual of Outliers: ", round((len(outliers_total) / len(outliers_removed) )*100, 4)) # Percentual of outliers in points
return
代码解读:
大于平均数+3倍的标准差为较大值,小于平均数+3倍的标准差为较小值
1.3 绘图
df_trans['TransactionAmt'] = df_trans['TransactionAmt'].astype(float)
total = len(df_trans)
total_amt = df_trans.groupby(['isFraud'])['TransactionAmt'].sum().sum()
plt.figure(figsize=(16,6))
plt.subplot(121)
g = sns.countplot(x='isFraud', data=df_trans, )
g.set_title("Fraud Transactions Distribution \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g.set_xlabel("Is fraud?", fontsize=18)
g.set_ylabel('Count', fontsize=18)
for p in g.patches:
height = p.get_height()
g.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/total*100),
ha="center", fontsize=15)
perc_amt = (df_trans.groupby(['isFraud'])['TransactionAmt'].sum())
perc_amt = perc_amt.reset_index()
plt.subplot(122)
g1 = sns.barplot(x='isFraud', y='TransactionAmt', dodge=True, data=perc_amt)
g1.set_title("% Total Amount in Transaction Amt \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g1.set_xlabel("Is fraud?", fontsize=18)
g1.set_ylabel('Total Transaction Amount Scalar', fontsize=18)
for p in g1.patches:
height = p.get_height()
g1.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/total_amt * 100),
ha="center", fontsize=15)
plt.show()
tmp = pd.crosstab(df_trans['ProductCD'], df_trans['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
plt.figure(figsize=(14,10))
plt.suptitle('ProductCD Distributions', fontsize=22)
plt.subplot(221)
g = sns.countplot(x='ProductCD', data=df_trans)
# plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])
g.set_title("ProductCD Distribution", fontsize=19)
g.set_xlabel("ProductCD Name", fontsize=17)
g.set_ylabel("Count", fontsize=17)
g.set_ylim(0,500000)
for p in g.patches:
height = p.get_height()
g.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/total*100),
ha="center", fontsize=14)
plt.subplot(222)
g1 = sns.countplot(x='ProductCD', hue='isFraud', data=df_trans)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
gt = g1.twinx()
gt = sns.pointplot(x='ProductCD', y='Fraud', data=tmp, color='black', order=['W', 'H',"C", "S", "R"], legend=False)
gt.set_ylabel("% of Fraud Transactions", fontsize=16)
g1.set_title("Product CD by Target(isFraud)", fontsize=19)
g1.set_xlabel("ProductCD Name", fontsize=17)
g1.set_ylabel("Count", fontsize=17)
plt.subplot(212)
g3 = sns.boxenplot(x='ProductCD', y='TransactionAmt', hue='isFraud',
data=df_trans[df_trans['TransactionAmt'] <= 2000] )
g3.set_title("Transaction Amount Distribuition by ProductCD and Target", fontsize=20)
g3.set_xlabel("ProductCD Name", fontsize=17)
g3.set_ylabel("Transaction Values", fontsize=17)
plt.subplots_adjust(hspace = 0.6, top = 0.85)
plt.show()
1.4 N分位
print("Card Features Quantiles: ")
print(df_trans[['card1', 'card2', 'card3', 'card5']].quantile([0.01, .025, .1, .25, .5, .75, .975, .99]))
Card Features Quantiles:
card1 card2 card3 card5
0.010 1338.0 100.0 144.0 102.0
0.025 1675.0 111.0 150.0 102.0
0.100 2803.0 111.0 150.0 126.0
0.250 6019.0 214.0 150.0 166.0
0.500 9678.0 361.0 150.0 226.0
0.750 14184.0 512.0 150.0 226.0
0.975 18018.0 583.0 185.0 226.0
0.990 18238.0 583.0 185.0 226.0
1.5 Hyperopt调参
https://blog.csdn.net/u012735708/article/details/84820101
1.6 思路整理
- 降低数据内存消耗
- 大概了解数据信息
- 对TransactionAmt列进行画图分析、N位数查看、和label合并画图、查看离群值。
- 对同类的列进行以上的操作
- 处理缺失值
- 对部分object列进行归类整理
- 对部分列进行映射处理
- 合并两个训练数据
- label encoder 和 pca
- 特征工程:
10.1 创建新特征:用减平均数和减标准差的方法
10.2 用除平均数和除标准差的方法
10.3 取对数 - 建模调参